Last active
January 15, 2016 10:17
-
-
Save zshamrock/7bb49d5f10d93a2b2580 to your computer and use it in GitHub Desktop.
Unicode and UTF Talk https://slides.com/aliaksandrkazlou/unicode-and-utf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
General:
http://www.joelonsoftware.com/articles/Unicode.html

Unicode:
http://www.unicode.org/standard/WhatIsUnicode.html
http://www.unicode.org/history/unicode88.pdf
http://unicode.org/charts/
http://unicode.org/cldr/utility/character.jsp
https://en.wikipedia.org/wiki/List_of_Unicode_characters

UTF:
http://www.utf-8.com/
http://www.cl.cam.ac.uk/~mgk25/ucs/utf-8-history.txt
http://www.ietf.org/rfc/rfc3629.txt
http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G7404
http://unicode.org/faq/utf_bom.html

Java:
http://docs.oracle.com/javase/7/docs/technotes/guides/intl/overview.html
http://www.oracle.com/technetwork/articles/java/supplementary-142654.html

Go:
https://blog.golang.org/strings
https://github.com/paulrosania/go-charset

Normalization:
https://blog.golang.org/normalization
https://docs.oracle.com/javase/8/docs/api/java/text/Normalizer.html
http://en.wikipedia.org/wiki/Unicode_equivalence
http://unicode.org/reports/tr15/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
// https://play.golang.org/p/XydB2AQ1dV | |
import ( | |
"fmt" | |
"unicode/utf8" | |
"unicode/utf16" | |
"bytes" | |
) | |
// main shows how the supplementary code point U+1030C is represented in
// UTF-16 (as a surrogate pair) and in UTF-8 (as a multi-byte sequence).
func main() {
	r := rune(0x1030c)

	// A code point above U+FFFF is split into a high and a low surrogate.
	r1, r2 := utf16.EncodeRune(r)
	fmt.Printf("%U %U %v %v %v\n", r1, r2, utf16.IsSurrogate(r), utf16.IsSurrogate(r1), utf16.IsSurrogate(r2)) // => U+D800 U+DF0C false true true

	// Surrogate halves are not valid runes on their own, so each WriteRune
	// call stores the replacement character instead of the surrogate.
	var buf bytes.Buffer
	buf.WriteRune(r1)
	buf.WriteRune(r2)
	fmt.Printf("U+1030C is %s %X %X\n", buf.String(), r1, r2) // => U+1030C is �� D800 DF0C

	// Size the scratch slice for the longest possible encoding and keep only
	// the bytes EncodeRune reports as written, so a shorter rune would not
	// leave trailing zero bytes in the output.
	bb := make([]byte, utf8.UTFMax)
	n := utf8.EncodeRune(bb, r)
	bb = bb[:n]
	buf.Reset()
	buf.Write(bb)
	fmt.Printf("U+1030C is %s, raw bytes %X %v\n", buf.String(), bb, bb) // => U+1030C is 𐌌, raw bytes F0908C8C [240 144 140 140]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package experiments; | |
import java.io.PrintStream; | |
import java.nio.charset.StandardCharsets; | |
import java.text.Normalizer; | |
/**
 * Console walkthrough of Unicode handling in Java: code points, decoding the
 * same bytes with different charsets, UTF-16 byte order marks, surrogate
 * pairs, and canonical/compatibility normalization.
 *
 * <p>The {@code // =>} comments record the expected console output.
 */
public class Unicode {

    /**
     * Runs every demonstration in order; the command-line arguments are not used.
     */
    public static void main(String[] args) {
        showCodePoints();
        showDecoding();
        showEscapes();
        showSurrogates();
        showNormalization();
    }

    /** Prints the code points of the two characters in the sample string. */
    private static void showCodePoints() {
        System.out.printf("%X%n", Character.codePointAt("世界", 0)); // => 4E16
        System.out.printf("%X%n", Character.codePointAt("世界", 1)); // => 754C
    }

    /** Decodes fixed byte arrays with several charsets and prints text and length. */
    private static void showDecoding() {
        // "Hello, " as single bytes followed by the three-byte UTF-8 sequences of the two CJK characters.
        byte[] helloWorld = new byte[]{0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, 0x20,
                (byte) 0xe4, (byte) 0xb8, (byte) 0x96, (byte) 0xe7, (byte) 0x95, (byte) 0x8c};
        System.out.printf("Byte array length: %d\n\n", helloWorld.length); // => Byte array length: 13

        String utf8 = new String(helloWorld, StandardCharsets.UTF_8);
        print("UTF-8", utf8); // => UTF-8 : Hello, 世界 Length 9

        String ascii = new String(helloWorld, StandardCharsets.US_ASCII);
        print("ASCII", ascii); // => ASCII : Hello, ������ Length 13

        // The UTF-8 bytes read as UTF-16 pair up into unrelated characters.
        String utf16 = new String(helloWorld, StandardCharsets.UTF_16);
        print("UTF-16", utf16); // => UTF-16 : 䡥汬漬⃤뢖� Length 7

        // Same text as big-endian 16-bit units, without a byte order mark.
        byte[] helloWorld16 = new byte[]{0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20,
                0x4e, 0x16, 0x75, 0x4c};
        String utf16Correct = new String(helloWorld16, StandardCharsets.UTF_16);
        print("UTF-16", utf16Correct); // => UTF-16 : Hello, 世界 Length 9

        // Big-endian units prefixed with the FE FF byte order mark.
        byte[] helloWorld16BE = new byte[]{(byte)0xfe, (byte)0xff, 0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20,
                0x4e, 0x16, 0x75, 0x4c};
        String utf16BE1 = new String(helloWorld16BE, StandardCharsets.UTF_16);
        print("UTF-16 BE1", utf16BE1); // => UTF-16 BE1: Hello, 世界 Length 9
        // With an explicit byte order the mark is decoded as a character, hence one extra char.
        String utf16BE2 = new String(helloWorld16BE, StandardCharsets.UTF_16BE);
        print("UTF-16 BE2", utf16BE2); // => UTF-16 BE2: Hello, 世界 Length 10
        // Wrong byte order: every 16-bit unit is read byte-swapped.
        String utf16LE1 = new String(helloWorld16BE, StandardCharsets.UTF_16LE);
        print("UTF-16 LE1", utf16LE1); // => UTF-16 LE1: �䠀攀氀氀漀Ⰰ ᙎ䱵 Length 10

        // Little-endian units prefixed with the FF FE byte order mark.
        byte[] helloWorld16LE = new byte[]{(byte)0xff, (byte)0xfe, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20, 0x00,
                0x16, 0x4e, 0x4c, 0x75};
        String utf16LE2 = new String(helloWorld16LE, StandardCharsets.UTF_16);
        print("UTF-16 LE2", utf16LE2); // => UTF-16 LE2: Hello, 世界 Length 9
        String utf16LE3 = new String(helloWorld16LE, StandardCharsets.UTF_16LE);
        print("UTF-16 LE3", utf16LE3); // => UTF-16 LE3: Hello, 世界 Length 10
    }

    /** Prints the same text written once with Unicode escapes and once literally. */
    private static void showEscapes() {
        // every char is 2 bytes (16 bit)
        System.out.printf("Hello, world is %s\n", "\u0048\u0065\u006c\u006c\u006f\u002c\u0020\u4e16\u754c"); // => Hello, world is Hello, 世界
        System.out.printf("World is %s\n", "世界"); // => World is 世界
    }

    /** Prints supplementary characters built from surrogate pairs, then the pair for U+10302. */
    private static void showSurrogates() {
        // surrogates
        // Bytes D8 00 DF 0C are the big-endian surrogate pair D800 DF0C.
        System.out.println(new String(new byte[] {(byte) 0xd8, 0x00, (byte) 0xdf, 0x0c}, StandardCharsets.UTF_16));
        // The same kind of pair written directly as two escaped chars (D800 DF02).
        System.out.println("\ud800\udf02");
        char hs = Character.highSurrogate(0x10302);
        char ls = Character.lowSurrogate(0x10302);
        System.out.printf("%x %x\n", (int) hs, (int) ls); // => d800 df02
    }

    /** Compares composed, decomposed and ligature strings before and after normalization. */
    private static void showNormalization() {
        // normalization
        String accentA = "\u00c1";
        String decomposedAccentA = "\u0041\u0301";
        System.out.printf("%s logical eq %s, but alone \u0041, and accent alone \u0301, but eq? %b\n", accentA, decomposedAccentA, accentA.equals(decomposedAccentA)); // => Á logical eq Á, but alone A, and accent alone ́, but eq? false

        String ffi = "\ufb03";
        String compatibleDecomposedFFI = "\u0066\u0066\u0069";
        System.out.printf("%s logical/semantically eq %s, but eq? %b\n", ffi, compatibleDecomposedFFI, ffi.equals(compatibleDecomposedFFI)); // => ﬃ logical/semantically eq ffi, but eq? false

        // Canonical forms: NFD splits the accented letter, NFC recombines it.
        System.out.println(Normalizer.normalize(accentA, Normalizer.Form.NFD).length()); // => 2
        System.out.println(Normalizer.normalize(accentA, Normalizer.Form.NFC).length()); // => 1
        System.out.println(Normalizer.normalize(decomposedAccentA, Normalizer.Form.NFD).length()); // => 2
        System.out.println(Normalizer.normalize(decomposedAccentA, Normalizer.Form.NFC).length()); // => 1

        // Compatibility forms expand the ligature into its three letters.
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKD).length()); // => 3
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKC).length()); // => 3
        System.out.println(Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFKD).length()); // => 3
        System.out.println(Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFKC).length()); // => 3

        // Canonical forms leave the ligature untouched.
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFD).length()); // => 1
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFC).length()); // => 1
        System.out.println(Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFD).length()); // => 3
        System.out.println(Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFC).length()); // => 3

        // After normalizing to a common form the strings compare equal.
        System.out.println(accentA.equals(Normalizer.normalize(decomposedAccentA, Normalizer.Form.NFC))); // => true
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKC).equals(Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFKC))); // => true
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKC)); // => ffi
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKD)); // => ffi
    }

    /**
     * Prints one aligned result row: the label, the decoded text and its
     * length in {@code char} units.
     *
     * @param charset label shown in the first column
     * @param text    decoded string to display and measure
     */
    private static void print(String charset, String text) {
        System.out.printf("%-10s: %-12s Length %d\n", charset, text, text.length());
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"unicode/utf8" | |
) | |
// main contrasts the byte length of a string with its rune count and then
// dumps the underlying UTF-8 bytes in hexadecimal.
func main() {
	const text = "世界"

	byteLen := len(text)
	runeCount := utf8.RuneCountInString(text)

	fmt.Println(byteLen)      // => 6
	fmt.Println(runeCount)    // => 2
	fmt.Printf("% X\n", text) // => E4 B8 96 E7 95 8C
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment