package main

// http://play.golang.org/p/fVf7duRtdH

import "fmt"
import "unicode/utf16"
import "unicode/utf8"
import "bytes"

func main() {
	b := []byte{
		0xff, // BOM
		0xfe, // BOM
		'T',
		0x00,
		'E',
		0x00,
		'S',
		0x00,
		'T',
		0x00,
		0x6C,
		0x34,
		'\n',
		0x00,
	}

	s, err := DecodeUTF16(b)
	if err != nil {
		panic(err)
	}
	fmt.Println(s)
}

// DecodeUTF16 converts a little-endian UTF-16 byte slice to a UTF-8 string,
// decoding one 16-bit unit at a time. Note that this does not handle
// surrogate pairs; see the comments below.
func DecodeUTF16(b []byte) (string, error) {
	if len(b)%2 != 0 {
		return "", fmt.Errorf("must have even length byte slice")
	}

	u16s := make([]uint16, 1)
	ret := &bytes.Buffer{}
	b8buf := make([]byte, 4)

	lb := len(b)
	for i := 0; i < lb; i += 2 {
		u16s[0] = uint16(b[i]) + (uint16(b[i+1]) << 8) // combine two bytes, little-endian
		r := utf16.Decode(u16s)
		n := utf8.EncodeRune(b8buf, r[0])
		ret.Write(b8buf[:n])
	}

	return ret.String(), nil
}
Thanks very much! helped a lot~
Life saver! You are amazing!
Thank you!
Incorrect result when decoding any surrogate pair; this should take care of the high/low surrogate range.

A quick fix: increase the size of u16s to two elements:

u16s := make([]uint16, 2)

and, inside the loop, pull in the trail unit whenever the first unit is a lead surrogate (the lead surrogate range is 0xD800–0xDBFF):

if u16s[0] >= 0xD800 && u16s[0] < 0xDC00 { // lead (high) surrogate
	i += 2
	u16s[1] = uint16(b[i]) + (uint16(b[i+1]) << 8) // trail (low) surrogate
}
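Folding that fix into the gist's function, a minimal sketch might look like this (same imports as the gist; the check for a truncated trailing pair is my addition, not part of the quick fix above):

func DecodeUTF16(b []byte) (string, error) {
	if len(b)%2 != 0 {
		return "", fmt.Errorf("must have even length byte slice")
	}

	ret := &bytes.Buffer{}
	b8buf := make([]byte, 4)

	for i := 0; i < len(b); i += 2 {
		// start with one 16-bit unit, little-endian
		u16s := []uint16{uint16(b[i]) + (uint16(b[i+1]) << 8)}
		if u16s[0] >= 0xD800 && u16s[0] < 0xDC00 { // lead surrogate: consume the trail unit too
			if i+3 >= len(b) {
				return "", fmt.Errorf("truncated surrogate pair")
			}
			i += 2
			u16s = append(u16s, uint16(b[i])+(uint16(b[i+1])<<8))
		}
		for _, r := range utf16.Decode(u16s) {
			n := utf8.EncodeRune(b8buf, r)
			ret.Write(b8buf[:n])
		}
	}

	return ret.String(), nil
}

For a lone or malformed surrogate, utf16.Decode yields U+FFFD (the replacement character) rather than an error, which matches the standard library's behavior elsewhere.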
golang already has support for decoding []byte into []uint16 (respecting the endianness):
func DecodeUtf16(b []byte, order binary.ByteOrder) (string, error) {
	ints := make([]uint16, len(b)/2)
	if err := binary.Read(bytes.NewReader(b), order, &ints); err != nil {
		return "", err
	}
	return string(utf16.Decode(ints)), nil
}
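For a self-contained version, here is a sketch with the imports this needs and one way to call it (the sample input bytes are mine):

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"unicode/utf16"
)

func DecodeUtf16(b []byte, order binary.ByteOrder) (string, error) {
	ints := make([]uint16, len(b)/2)
	if err := binary.Read(bytes.NewReader(b), order, &ints); err != nil {
		return "", err
	}
	return string(utf16.Decode(ints)), nil
}

func main() {
	// "TEST\n" encoded as little-endian UTF-16
	le := []byte{'T', 0x00, 'E', 0x00, 'S', 0x00, 'T', 0x00, '\n', 0x00}
	s, err := DecodeUtf16(le, binary.LittleEndian)
	if err != nil {
		panic(err)
	}
	fmt.Print(s) // TEST
}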
@akirabbq @ik5
complete solution (which also works with surrogate pairs): utf16.go
From the blog at http://angelonotes.blogspot.com/2015/09/golang-utf16-utf8.html:
bs_UTF16LE, _, _ := transform.Bytes(unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewEncoder(), []byte("測試"))
bs_UTF16BE, _, _ := transform.Bytes(unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewEncoder(), []byte("測試"))
bs_UTF8LE, _, _ := transform.Bytes(unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder(), bs_UTF16LE)
bs_UTF8BE, _, _ := transform.Bytes(unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewDecoder(), bs_UTF16BE)
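These one-liners rely on the golang.org/x/text module; a minimal runnable round trip, assuming that module is available, might look like this:

package main

import (
	"fmt"

	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

func main() {
	enc := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewEncoder()
	dec := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder()

	// UTF-8 -> UTF-16LE
	utf16le, _, err := transform.Bytes(enc, []byte("測試"))
	if err != nil {
		panic(err)
	}

	// UTF-16LE -> UTF-8 (round trip)
	utf8bytes, _, err := transform.Bytes(dec, utf16le)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(utf8bytes)) // 測試
}

Unlike the gist's loop, the x/text transformer implements full UTF-16, so surrogate pairs decode correctly.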
Saved me a lot, thank you
Thanks very much!
This is just the function I need to convert Oracle CLOB data to a string.
Life saver, thanks
This helped me to decode UTF-16LE to UTF-8: https://blog.fearcat.in/a?ID=00001-1bd90844-ce0c-4fac-9b8f-fe3d8a30451d
decoder := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM).NewDecoder()
utf8bytes, err := decoder.Bytes(data) // data contains UTF16LE as read from a file
This code is for little endian. For big endian, change the line that combines the two bytes (u16s[0] = ...) like so:

u16s[0] = uint16(b[i+1]) + (uint16(b[i]) << 8)

I'm looking for a way to detect the endianness from the BOM (the first two bytes), so this code can handle it automatically.
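One possible sketch (my own suggestion, not from the thread): check the first two bytes against the two possible BOMs and fall back to a default when there is none. The helper name is hypothetical; it pairs with the binary.ByteOrder-based DecodeUtf16 above and needs the encoding/binary import.

// detectUTF16ByteOrder is a hypothetical helper: it sniffs the BOM and
// falls back to little-endian when no BOM is present.
func detectUTF16ByteOrder(b []byte) binary.ByteOrder {
	if len(b) >= 2 {
		switch {
		case b[0] == 0xFF && b[1] == 0xFE:
			return binary.LittleEndian // LE BOM: FF FE
		case b[0] == 0xFE && b[1] == 0xFF:
			return binary.BigEndian // BE BOM: FE FF
		}
	}
	return binary.LittleEndian // no BOM: assume little-endian
}

Alternatively, the x/text decoder shown earlier can do this automatically: unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewDecoder() honors a leading BOM and only uses the given endianness as the fallback.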