mirror of
https://github.com/golang/go.git
synced 2025-05-22 16:09:37 +00:00
os, syscall: support ill-formed UTF-16 strings on Windows
Windows UTF-16 strings can contain unpaired surrogates, which can't be decoded into a valid UTF-8 string. This file defines a set of functions that can be used to encode and decode potentially ill-formed UTF-16 strings by using the [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). WTF-8 is a strict superset of UTF-8, i.e. any string that is well-formed in UTF-8 is also well-formed in WTF-8 and the content is unchanged. Also, the conversion never fails and is lossless. The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string is that the conversion is lossless even for ill-formed UTF-16 strings. This property allows to read an ill-formed UTF-16 string, convert it to a Go string, and convert it back to the same original UTF-16 string. Fixes #59971 Change-Id: Id6007f6e537844913402b233e73d698688cd5ba6 Reviewed-on: https://go-review.googlesource.com/c/go/+/493036 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Bryan Mills <bcmills@google.com> Run-TryBot: Quim Muntal <quimmuntal@gmail.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Paul Hampson <Paul.Hampson@Pobox.com>
This commit is contained in:
parent
91b8cc0dfa
commit
974236bda9
@ -9,7 +9,6 @@ package execenv
|
|||||||
import (
|
import (
|
||||||
"internal/syscall/windows"
|
"internal/syscall/windows"
|
||||||
"syscall"
|
"syscall"
|
||||||
"unicode/utf16"
|
|
||||||
"unsafe"
|
"unsafe"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -41,7 +40,7 @@ func Default(sys *syscall.SysProcAttr) (env []string, err error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
entry := unsafe.Slice(blockp, (uintptr(end)-uintptr(unsafe.Pointer(blockp)))/2)
|
entry := unsafe.Slice(blockp, (uintptr(end)-uintptr(unsafe.Pointer(blockp)))/2)
|
||||||
env = append(env, string(utf16.Decode(entry)))
|
env = append(env, syscall.UTF16ToString(entry))
|
||||||
blockp = (*uint16)(unsafe.Add(end, size))
|
blockp = (*uint16)(unsafe.Add(end, size))
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
|
@ -217,7 +217,7 @@ func (k Key) GetStringsValue(name string) (val []string, valtype uint32, err err
|
|||||||
from := 0
|
from := 0
|
||||||
for i, c := range p {
|
for i, c := range p {
|
||||||
if c == 0 {
|
if c == 0 {
|
||||||
val = append(val, string(utf16.Decode(p[from:i])))
|
val = append(val, syscall.UTF16ToString(p[from:i]))
|
||||||
from = i + 1
|
from = i + 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,6 @@ package windows
|
|||||||
import (
|
import (
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
"unicode/utf16"
|
|
||||||
"unsafe"
|
"unsafe"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -17,17 +16,13 @@ func UTF16PtrToString(p *uint16) string {
|
|||||||
if p == nil {
|
if p == nil {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
// Find NUL terminator.
|
|
||||||
end := unsafe.Pointer(p)
|
end := unsafe.Pointer(p)
|
||||||
n := 0
|
n := 0
|
||||||
for *(*uint16)(end) != 0 {
|
for *(*uint16)(end) != 0 {
|
||||||
end = unsafe.Pointer(uintptr(end) + unsafe.Sizeof(*p))
|
end = unsafe.Pointer(uintptr(end) + unsafe.Sizeof(*p))
|
||||||
n++
|
n++
|
||||||
}
|
}
|
||||||
// Turn *uint16 into []uint16.
|
return syscall.UTF16ToString(unsafe.Slice(p, n))
|
||||||
s := unsafe.Slice(p, n)
|
|
||||||
// Decode []uint16 into string.
|
|
||||||
return string(utf16.Decode(s))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
@ -11,7 +11,6 @@ import (
|
|||||||
"runtime"
|
"runtime"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
"unicode/utf16"
|
|
||||||
"unsafe"
|
"unsafe"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -104,7 +103,7 @@ func (file *File) readdir(n int, mode readdirMode) (names []string, dirents []Di
|
|||||||
d.bufp = 0
|
d.bufp = 0
|
||||||
}
|
}
|
||||||
nameslice := unsafe.Slice(&info.FileName[0], info.FileNameLength/2)
|
nameslice := unsafe.Slice(&info.FileName[0], info.FileNameLength/2)
|
||||||
name := string(utf16.Decode(nameslice))
|
name := syscall.UTF16ToString(nameslice)
|
||||||
if name == "." || name == ".." { // Useless names
|
if name == "." || name == ".." { // Useless names
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -587,7 +587,6 @@ package main
|
|||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
"syscall"
|
"syscall"
|
||||||
"unicode/utf16"
|
|
||||||
"unsafe"
|
"unsafe"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -599,7 +598,7 @@ func getMyName() (string, error) {
|
|||||||
if n == 0 {
|
if n == 0 {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return string(utf16.Decode(b[0:n])), nil
|
return syscall.UTF16ToString(b[0:n]), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
@ -11,7 +11,6 @@ import (
|
|||||||
"runtime"
|
"runtime"
|
||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
"unicode/utf16"
|
|
||||||
"unsafe"
|
"unsafe"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -259,7 +258,7 @@ func tempDir() string {
|
|||||||
// Otherwise remove terminating \.
|
// Otherwise remove terminating \.
|
||||||
n--
|
n--
|
||||||
}
|
}
|
||||||
return string(utf16.Decode(b[:n]))
|
return syscall.UTF16ToString(b[:n])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,6 +18,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"reflect"
|
"reflect"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
"slices"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"syscall"
|
"syscall"
|
||||||
@ -1377,3 +1378,71 @@ func TestAppExecLinkStat(t *testing.T) {
|
|||||||
t.Errorf("exec.LookPath(%q) = %q; want %q", pythonPath, p, pythonPath)
|
t.Errorf("exec.LookPath(%q) = %q; want %q", pythonPath, p, pythonPath)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestIllformedUTF16FileName(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
const sep = string(os.PathSeparator)
|
||||||
|
if !strings.HasSuffix(dir, sep) {
|
||||||
|
dir += sep
|
||||||
|
}
|
||||||
|
|
||||||
|
// This UTF-16 file name is ill-formed as it contains low surrogates that are not preceded by high surrogates ([1:5]).
|
||||||
|
namew := []uint16{0x2e, 0xdc6d, 0xdc73, 0xdc79, 0xdc73, 0x30, 0x30, 0x30, 0x31, 0}
|
||||||
|
|
||||||
|
// Create a file whose name contains unpaired surrogates.
|
||||||
|
// Use syscall.CreateFile instead of os.Create to simulate a file that is created by
|
||||||
|
// a non-Go program so the file name hasn't gone through syscall.UTF16FromString.
|
||||||
|
dirw := utf16.Encode([]rune(dir))
|
||||||
|
pathw := append(dirw, namew...)
|
||||||
|
fd, err := syscall.CreateFile(&pathw[0], syscall.GENERIC_ALL, 0, nil, syscall.CREATE_NEW, 0, 0)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
syscall.CloseHandle(fd)
|
||||||
|
|
||||||
|
name := syscall.UTF16ToString(namew)
|
||||||
|
path := filepath.Join(dir, name)
|
||||||
|
// Verify that os.Lstat can query the file.
|
||||||
|
fi, err := os.Lstat(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if got := fi.Name(); got != name {
|
||||||
|
t.Errorf("got %q, want %q", got, name)
|
||||||
|
}
|
||||||
|
// Verify that File.Readdirnames lists the file.
|
||||||
|
f, err := os.Open(dir)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
files, err := f.Readdirnames(0)
|
||||||
|
f.Close()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if !slices.Contains(files, name) {
|
||||||
|
t.Error("file not listed")
|
||||||
|
}
|
||||||
|
// Verify that os.RemoveAll can remove the directory
|
||||||
|
// and that it doesn't hang.
|
||||||
|
err = os.RemoveAll(dir)
|
||||||
|
if err != nil {
|
||||||
|
t.Error(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestUTF16Alloc(t *testing.T) {
|
||||||
|
allowsPerRun := func(want int, f func()) {
|
||||||
|
t.Helper()
|
||||||
|
got := int(testing.AllocsPerRun(5, f))
|
||||||
|
if got != want {
|
||||||
|
t.Errorf("got %d allocs, want %d", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
allowsPerRun(1, func() {
|
||||||
|
syscall.UTF16ToString([]uint16{'a', 'b', 'c'})
|
||||||
|
})
|
||||||
|
allowsPerRun(1, func() {
|
||||||
|
syscall.UTF16FromString("abc")
|
||||||
|
})
|
||||||
|
}
|
||||||
|
@ -7,7 +7,6 @@
|
|||||||
package syscall
|
package syscall
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"unicode/utf16"
|
|
||||||
"unsafe"
|
"unsafe"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -24,7 +23,7 @@ func Getenv(key string) (value string, found bool) {
|
|||||||
return "", false
|
return "", false
|
||||||
}
|
}
|
||||||
if n <= uint32(len(b)) {
|
if n <= uint32(len(b)) {
|
||||||
return string(utf16.Decode(b[:n])), true
|
return UTF16ToString(b[:n]), true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -90,7 +89,7 @@ func Environ() []string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
entry := unsafe.Slice(envp, (uintptr(end)-uintptr(unsafe.Pointer(envp)))/size)
|
entry := unsafe.Slice(envp, (uintptr(end)-uintptr(unsafe.Pointer(envp)))/size)
|
||||||
r = append(r, string(utf16.Decode(entry)))
|
r = append(r, UTF16ToString(entry))
|
||||||
envp = (*uint16)(unsafe.Add(end, size))
|
envp = (*uint16)(unsafe.Add(end, size))
|
||||||
}
|
}
|
||||||
return r
|
return r
|
||||||
|
@ -9,3 +9,6 @@ var UpdateProcThreadAttribute = updateProcThreadAttribute
|
|||||||
var DeleteProcThreadAttributeList = deleteProcThreadAttributeList
|
var DeleteProcThreadAttributeList = deleteProcThreadAttributeList
|
||||||
|
|
||||||
const PROC_THREAD_ATTRIBUTE_HANDLE_LIST = _PROC_THREAD_ATTRIBUTE_HANDLE_LIST
|
const PROC_THREAD_ATTRIBUTE_HANDLE_LIST = _PROC_THREAD_ATTRIBUTE_HANDLE_LIST
|
||||||
|
|
||||||
|
var EncodeWTF16 = encodeWTF16
|
||||||
|
var DecodeWTF16 = decodeWTF16
|
||||||
|
@ -14,7 +14,6 @@ import (
|
|||||||
"internal/race"
|
"internal/race"
|
||||||
"runtime"
|
"runtime"
|
||||||
"sync"
|
"sync"
|
||||||
"unicode/utf16"
|
|
||||||
"unsafe"
|
"unsafe"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -37,7 +36,8 @@ func StringToUTF16(s string) []uint16 {
|
|||||||
|
|
||||||
// UTF16FromString returns the UTF-16 encoding of the UTF-8 string
|
// UTF16FromString returns the UTF-16 encoding of the UTF-8 string
|
||||||
// s, with a terminating NUL added. If s contains a NUL byte at any
|
// s, with a terminating NUL added. If s contains a NUL byte at any
|
||||||
// location, it returns (nil, EINVAL).
|
// location, it returns (nil, EINVAL). Unpaired surrogates
|
||||||
|
// are encoded using WTF-8.
|
||||||
func UTF16FromString(s string) ([]uint16, error) {
|
func UTF16FromString(s string) ([]uint16, error) {
|
||||||
if bytealg.IndexByteString(s, 0) != -1 {
|
if bytealg.IndexByteString(s, 0) != -1 {
|
||||||
return nil, EINVAL
|
return nil, EINVAL
|
||||||
@ -49,22 +49,37 @@ func UTF16FromString(s string) ([]uint16, error) {
|
|||||||
// equal than the number of UTF-16 code units.
|
// equal than the number of UTF-16 code units.
|
||||||
// Also account for the terminating NUL character.
|
// Also account for the terminating NUL character.
|
||||||
buf := make([]uint16, 0, len(s)+1)
|
buf := make([]uint16, 0, len(s)+1)
|
||||||
for _, r := range s {
|
buf = encodeWTF16(s, buf)
|
||||||
buf = utf16.AppendRune(buf, r)
|
return append(buf, 0), nil
|
||||||
}
|
|
||||||
return utf16.AppendRune(buf, '\x00'), nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// UTF16ToString returns the UTF-8 encoding of the UTF-16 sequence s,
|
// UTF16ToString returns the UTF-8 encoding of the UTF-16 sequence s,
|
||||||
// with a terminating NUL removed.
|
// with a terminating NUL removed. Unpaired surrogates are decoded
|
||||||
|
// using WTF-8 instead of UTF-8 encoding.
|
||||||
func UTF16ToString(s []uint16) string {
|
func UTF16ToString(s []uint16) string {
|
||||||
|
maxLen := 0
|
||||||
for i, v := range s {
|
for i, v := range s {
|
||||||
if v == 0 {
|
if v == 0 {
|
||||||
s = s[0:i]
|
s = s[0:i]
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
switch {
|
||||||
|
case v <= rune1Max:
|
||||||
|
maxLen += 1
|
||||||
|
case v <= rune2Max:
|
||||||
|
maxLen += 2
|
||||||
|
default:
|
||||||
|
// r is a non-surrogate that decodes to 3 bytes,
|
||||||
|
// or is an unpaired surrogate (also 3 bytes in WTF-8),
|
||||||
|
// or is one half of a valid surrogate pair.
|
||||||
|
// If it is half of a pair, we will add 3 for the second surrogate
|
||||||
|
// (total of 6) and overestimate by 2 bytes for the pair,
|
||||||
|
// since the resulting rune only requires 4 bytes.
|
||||||
|
maxLen += 3
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return string(utf16.Decode(s))
|
buf := decodeWTF16(s, make([]byte, 0, maxLen))
|
||||||
|
return unsafe.String(unsafe.SliceData(buf), len(buf))
|
||||||
}
|
}
|
||||||
|
|
||||||
// utf16PtrToString is like UTF16ToString, but takes *uint16
|
// utf16PtrToString is like UTF16ToString, but takes *uint16
|
||||||
@ -73,17 +88,13 @@ func utf16PtrToString(p *uint16) string {
|
|||||||
if p == nil {
|
if p == nil {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
// Find NUL terminator.
|
|
||||||
end := unsafe.Pointer(p)
|
end := unsafe.Pointer(p)
|
||||||
n := 0
|
n := 0
|
||||||
for *(*uint16)(end) != 0 {
|
for *(*uint16)(end) != 0 {
|
||||||
end = unsafe.Pointer(uintptr(end) + unsafe.Sizeof(*p))
|
end = unsafe.Pointer(uintptr(end) + unsafe.Sizeof(*p))
|
||||||
n++
|
n++
|
||||||
}
|
}
|
||||||
// Turn *uint16 into []uint16.
|
return UTF16ToString(unsafe.Slice(p, n))
|
||||||
s := unsafe.Slice(p, n)
|
|
||||||
// Decode []uint16 into string.
|
|
||||||
return string(utf16.Decode(s))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// StringToUTF16Ptr returns pointer to the UTF-16 encoding of
|
// StringToUTF16Ptr returns pointer to the UTF-16 encoding of
|
||||||
@ -97,6 +108,7 @@ func StringToUTF16Ptr(s string) *uint16 { return &StringToUTF16(s)[0] }
|
|||||||
// UTF16PtrFromString returns pointer to the UTF-16 encoding of
|
// UTF16PtrFromString returns pointer to the UTF-16 encoding of
|
||||||
// the UTF-8 string s, with a terminating NUL added. If s
|
// the UTF-8 string s, with a terminating NUL added. If s
|
||||||
// contains a NUL byte at any location, it returns (nil, EINVAL).
|
// contains a NUL byte at any location, it returns (nil, EINVAL).
|
||||||
|
// Unpaired surrogates are encoded using WTF-8.
|
||||||
func UTF16PtrFromString(s string) (*uint16, error) {
|
func UTF16PtrFromString(s string) (*uint16, error) {
|
||||||
a, err := UTF16FromString(s)
|
a, err := UTF16FromString(s)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -143,7 +155,7 @@ func (e Errno) Error() string {
|
|||||||
// trim terminating \r and \n
|
// trim terminating \r and \n
|
||||||
for ; n > 0 && (b[n-1] == '\n' || b[n-1] == '\r'); n-- {
|
for ; n > 0 && (b[n-1] == '\n' || b[n-1] == '\r'); n-- {
|
||||||
}
|
}
|
||||||
return string(utf16.Decode(b[:n]))
|
return UTF16ToString(b[:n])
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -525,7 +537,7 @@ func Getwd() (wd string, err error) {
|
|||||||
if e != nil {
|
if e != nil {
|
||||||
return "", e
|
return "", e
|
||||||
}
|
}
|
||||||
return string(utf16.Decode(b[0:n])), nil
|
return UTF16ToString(b[0:n]), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func Chdir(path string) (err error) {
|
func Chdir(path string) (err error) {
|
||||||
@ -573,13 +585,13 @@ func Rename(oldpath, newpath string) (err error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func ComputerName() (name string, err error) {
|
func ComputerName() (name string, err error) {
|
||||||
var n uint32 = MAX_COMPUTERNAME_LENGTH + 1
|
b := make([]uint16, MAX_COMPUTERNAME_LENGTH+1)
|
||||||
b := make([]uint16, n)
|
var n uint32
|
||||||
e := GetComputerName(&b[0], &n)
|
e := GetComputerName(&b[0], &n)
|
||||||
if e != nil {
|
if e != nil {
|
||||||
return "", e
|
return "", e
|
||||||
}
|
}
|
||||||
return string(utf16.Decode(b[0:n])), nil
|
return UTF16ToString(b[:n]), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func Ftruncate(fd Handle, length int64) (err error) {
|
func Ftruncate(fd Handle, length int64) (err error) {
|
||||||
|
92
src/syscall/wtf8_windows.go
Normal file
92
src/syscall/wtf8_windows.go
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
// Copyright 2023 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Windows UTF-16 strings can contain unpaired surrogates, which can't be
|
||||||
|
// decoded into a valid UTF-8 string. This file defines a set of functions
|
||||||
|
// that can be used to encode and decode potentially ill-formed UTF-16 strings
|
||||||
|
// by using the [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
|
||||||
|
//
|
||||||
|
// WTF-8 is a strict superset of UTF-8, i.e. any string that is
|
||||||
|
// well-formed in UTF-8 is also well-formed in WTF-8 and the content
|
||||||
|
// is unchanged. Also, the conversion never fails and is lossless.
|
||||||
|
//
|
||||||
|
// The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string
|
||||||
|
// is that the conversion is lossless even for ill-formed UTF-16 strings.
|
||||||
|
// This property allows to read an ill-formed UTF-16 string, convert it
|
||||||
|
// to a Go string, and convert it back to the same original UTF-16 string.
|
||||||
|
//
|
||||||
|
// See go.dev/issues/59971 for more info.
|
||||||
|
|
||||||
|
package syscall
|
||||||
|
|
||||||
|
import (
|
||||||
|
"unicode/utf16"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
surr1 = 0xd800
|
||||||
|
surr2 = 0xdc00
|
||||||
|
surr3 = 0xe000
|
||||||
|
|
||||||
|
tx = 0b10000000
|
||||||
|
t3 = 0b11100000
|
||||||
|
maskx = 0b00111111
|
||||||
|
mask3 = 0b00001111
|
||||||
|
|
||||||
|
rune1Max = 1<<7 - 1
|
||||||
|
rune2Max = 1<<11 - 1
|
||||||
|
)
|
||||||
|
|
||||||
|
// encodeWTF16 returns the potentially ill-formed
|
||||||
|
// UTF-16 encoding of s.
|
||||||
|
func encodeWTF16(s string, buf []uint16) []uint16 {
|
||||||
|
for i := 0; i < len(s); {
|
||||||
|
// Cannot use 'for range s' because it expects valid
|
||||||
|
// UTF-8 runes.
|
||||||
|
r, size := utf8.DecodeRuneInString(s[i:])
|
||||||
|
if r == utf8.RuneError {
|
||||||
|
// Check if s[i:] contains a valid WTF-8 encoded surrogate.
|
||||||
|
if sc := s[i:]; len(sc) >= 3 && sc[0] == 0xED && 0xA0 <= sc[1] && sc[1] <= 0xBF && 0x80 <= sc[2] && sc[2] <= 0xBF {
|
||||||
|
r = rune(sc[0]&mask3)<<12 + rune(sc[1]&maskx)<<6 + rune(sc[2]&maskx)
|
||||||
|
buf = append(buf, uint16(r))
|
||||||
|
i += 3
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i += size
|
||||||
|
buf = utf16.AppendRune(buf, r)
|
||||||
|
}
|
||||||
|
return buf
|
||||||
|
}
|
||||||
|
|
||||||
|
// decodeWTF16 returns the WTF-8 encoding of
|
||||||
|
// the potentially ill-formed UTF-16 s.
|
||||||
|
func decodeWTF16(s []uint16, buf []byte) []byte {
|
||||||
|
for i := 0; i < len(s); i++ {
|
||||||
|
var ar rune
|
||||||
|
switch r := s[i]; {
|
||||||
|
case r < surr1, surr3 <= r:
|
||||||
|
// normal rune
|
||||||
|
ar = rune(r)
|
||||||
|
case surr1 <= r && r < surr2 && i+1 < len(s) &&
|
||||||
|
surr2 <= s[i+1] && s[i+1] < surr3:
|
||||||
|
// valid surrogate sequence
|
||||||
|
ar = utf16.DecodeRune(rune(r), rune(s[i+1]))
|
||||||
|
i++
|
||||||
|
default:
|
||||||
|
// WTF-8 fallback.
|
||||||
|
// This only handles the 3-byte case of utf8.AppendRune,
|
||||||
|
// as surrogates always fall in that case.
|
||||||
|
ar = rune(r)
|
||||||
|
if ar > utf8.MaxRune {
|
||||||
|
ar = utf8.RuneError
|
||||||
|
}
|
||||||
|
buf = append(buf, t3|byte(ar>>12), tx|byte(ar>>6)&maskx, tx|byte(ar)&maskx)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
buf = utf8.AppendRune(buf, ar)
|
||||||
|
}
|
||||||
|
return buf
|
||||||
|
}
|
200
src/syscall/wtf8_windows_test.go
Normal file
200
src/syscall/wtf8_windows_test.go
Normal file
@ -0,0 +1,200 @@
|
|||||||
|
// Copyright 2023 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package syscall_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"slices"
|
||||||
|
"syscall"
|
||||||
|
"testing"
|
||||||
|
"unicode/utf16"
|
||||||
|
"unicode/utf8"
|
||||||
|
"unsafe"
|
||||||
|
)
|
||||||
|
|
||||||
|
var wtf8tests = []struct {
|
||||||
|
str string
|
||||||
|
wstr []uint16
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
str: "\x00",
|
||||||
|
wstr: []uint16{0x00},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\x5C",
|
||||||
|
wstr: []uint16{0x5C},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\x7F",
|
||||||
|
wstr: []uint16{0x7F},
|
||||||
|
},
|
||||||
|
|
||||||
|
// 2-byte
|
||||||
|
{
|
||||||
|
str: "\xC2\x80",
|
||||||
|
wstr: []uint16{0x80},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\xD7\x8A",
|
||||||
|
wstr: []uint16{0x05CA},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\xDF\xBF",
|
||||||
|
wstr: []uint16{0x07FF},
|
||||||
|
},
|
||||||
|
|
||||||
|
// 3-byte
|
||||||
|
{
|
||||||
|
str: "\xE0\xA0\x80",
|
||||||
|
wstr: []uint16{0x0800},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\xE2\xB0\xBC",
|
||||||
|
wstr: []uint16{0x2C3C},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\xEF\xBF\xBF",
|
||||||
|
wstr: []uint16{0xFFFF},
|
||||||
|
},
|
||||||
|
// unmatched surrogate halves
|
||||||
|
// high surrogates: 0xD800 to 0xDBFF
|
||||||
|
{
|
||||||
|
str: "\xED\xA0\x80",
|
||||||
|
wstr: []uint16{0xD800},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// "High surrogate followed by another high surrogate"
|
||||||
|
str: "\xED\xA0\x80\xED\xA0\x80",
|
||||||
|
wstr: []uint16{0xD800, 0xD800},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// "High surrogate followed by a symbol that is not a surrogate"
|
||||||
|
str: string([]byte{0xED, 0xA0, 0x80, 0xA}),
|
||||||
|
wstr: []uint16{0xD800, 0xA},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// "Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate"
|
||||||
|
str: string([]byte{0xED, 0xA0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xA0, 0x80}),
|
||||||
|
wstr: []uint16{0xD800, 0xD834, 0xDF06, 0xD800},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\xED\xA6\xAF",
|
||||||
|
wstr: []uint16{0xD9AF},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\xED\xAF\xBF",
|
||||||
|
wstr: []uint16{0xDBFF},
|
||||||
|
},
|
||||||
|
// low surrogates: 0xDC00 to 0xDFFF
|
||||||
|
{
|
||||||
|
str: "\xED\xB0\x80",
|
||||||
|
wstr: []uint16{0xDC00},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// "Low surrogate followed by another low surrogate"
|
||||||
|
str: "\xED\xB0\x80\xED\xB0\x80",
|
||||||
|
wstr: []uint16{0xDC00, 0xDC00},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// "Low surrogate followed by a symbol that is not a surrogate"
|
||||||
|
str: string([]byte{0xED, 0xB0, 0x80, 0xA}),
|
||||||
|
wstr: []uint16{0xDC00, 0xA},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
// "Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate"
|
||||||
|
str: string([]byte{0xED, 0xB0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xB0, 0x80}),
|
||||||
|
wstr: []uint16{0xDC00, 0xD834, 0xDF06, 0xDC00},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\xED\xBB\xAE",
|
||||||
|
wstr: []uint16{0xDEEE},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\xED\xBF\xBF",
|
||||||
|
wstr: []uint16{0xDFFF},
|
||||||
|
},
|
||||||
|
|
||||||
|
// 4-byte
|
||||||
|
{
|
||||||
|
str: "\xF0\x90\x80\x80",
|
||||||
|
wstr: []uint16{0xD800, 0xDC00},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\xF0\x9D\x8C\x86",
|
||||||
|
wstr: []uint16{0xD834, 0xDF06},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
str: "\xF4\x8F\xBF\xBF",
|
||||||
|
wstr: []uint16{0xDBFF, 0xDFFF},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWTF16Rountrip(t *testing.T) {
|
||||||
|
for _, tt := range wtf8tests {
|
||||||
|
t.Run(fmt.Sprintf("%X", tt.str), func(t *testing.T) {
|
||||||
|
got := syscall.EncodeWTF16(tt.str, nil)
|
||||||
|
got2 := string(syscall.DecodeWTF16(got, nil))
|
||||||
|
if got2 != tt.str {
|
||||||
|
t.Errorf("got:\n%s\nwant:\n%s", got2, tt.str)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestWTF16Golden(t *testing.T) {
|
||||||
|
for _, tt := range wtf8tests {
|
||||||
|
t.Run(fmt.Sprintf("%X", tt.str), func(t *testing.T) {
|
||||||
|
got := syscall.EncodeWTF16(tt.str, nil)
|
||||||
|
if !slices.Equal(got, tt.wstr) {
|
||||||
|
t.Errorf("got:\n%v\nwant:\n%v", got, tt.wstr)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func FuzzEncodeWTF16(f *testing.F) {
|
||||||
|
for _, tt := range wtf8tests {
|
||||||
|
f.Add(tt.str)
|
||||||
|
}
|
||||||
|
f.Fuzz(func(t *testing.T, b string) {
|
||||||
|
// test that there are no panics
|
||||||
|
got := syscall.EncodeWTF16(b, nil)
|
||||||
|
syscall.DecodeWTF16(got, nil)
|
||||||
|
if utf8.ValidString(b) {
|
||||||
|
// if the input is a valid UTF-8 string, then
|
||||||
|
// test that syscall.EncodeWTF16 behaves as
|
||||||
|
// utf16.Encode
|
||||||
|
want := utf16.Encode([]rune(b))
|
||||||
|
if !slices.Equal(got, want) {
|
||||||
|
t.Errorf("got:\n%v\nwant:\n%v", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func FuzzDecodeWTF16(f *testing.F) {
|
||||||
|
for _, tt := range wtf8tests {
|
||||||
|
b := unsafe.Slice((*uint8)(unsafe.Pointer(unsafe.SliceData(tt.wstr))), len(tt.wstr)*2)
|
||||||
|
f.Add(b)
|
||||||
|
}
|
||||||
|
f.Fuzz(func(t *testing.T, b []byte) {
|
||||||
|
u16 := unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(b))), len(b)/2)
|
||||||
|
got := syscall.DecodeWTF16(u16, nil)
|
||||||
|
if utf8.Valid(got) {
|
||||||
|
// if the input is a valid UTF-8 string, then
|
||||||
|
// test that syscall.DecodeWTF16 behaves as
|
||||||
|
// utf16.Decode
|
||||||
|
want := utf16.Decode(u16)
|
||||||
|
if string(got) != string(want) {
|
||||||
|
t.Errorf("got:\n%s\nwant:\n%s", string(got), string(want))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// WTF-8 should always roundtrip
|
||||||
|
got2 := syscall.EncodeWTF16(string(got), nil)
|
||||||
|
if !slices.Equal(got2, u16) {
|
||||||
|
t.Errorf("got:\n%v\nwant:\n%v", got2, u16)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user