|
package experiments; |
|
|
|
import java.io.PrintStream; |
|
import java.nio.charset.StandardCharsets; |
|
import java.text.Normalizer; |
|
|
|
/**
 * Console walkthrough of how Java represents Unicode text.
 *
 * <p>The demo prints, in order: code points of a two-character CJK string, the result of
 * decoding the same bytes with different charsets (including byte-order-mark handling for
 * UTF-16), strings written with Unicode escapes, a supplementary character expressed as a
 * surrogate pair, and the effect of the four Unicode normalization forms.
 *
 * <p>All non-ASCII text is written with Unicode escapes so that compiling this file does not
 * depend on the source encoding handed to the compiler. What the terminal shows still depends
 * on the encoding used by {@code System.out}.
 */
public class Unicode {

    /**
     * Runs every demonstration and writes the results to {@code System.out}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        demoCodePoints();
        demoDecoding();
        demoEscapes();
        demoSurrogates();
        demoNormalization();
    }

    /** Prints the code points of U+4E16 U+754C ("world") in upper-case hex. */
    private static void demoCodePoints() {
        String world = "\u4e16\u754c";
        System.out.printf("%X%n", Character.codePointAt(world, 0)); // => 4E16
        System.out.printf("%X%n", Character.codePointAt(world, 1)); // => 754C
    }

    /** Decodes fixed byte sequences with several charsets and prints text plus char count. */
    private static void demoDecoding() {
        // "Hello, " (7 ASCII bytes) followed by U+4E16 U+754C encoded as UTF-8 (3 bytes each).
        byte[] helloWorld = {
            0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x2c, 0x20,
            (byte) 0xe4, (byte) 0xb8, (byte) 0x96, (byte) 0xe7, (byte) 0x95, (byte) 0x8c
        };
        System.out.printf("Byte array length: %d%n%n", helloWorld.length); // => Byte array length: 13

        // Right charset: 7 + 2 chars.
        print("UTF-8", new String(helloWorld, StandardCharsets.UTF_8)); // => Length 9

        // Each of the six bytes >= 0x80 is not ASCII and comes out as U+FFFD.
        print("ASCII", new String(helloWorld, StandardCharsets.US_ASCII)); // => Length 13

        // Wrong charset: byte pairs are read as 16-bit units, the odd 13th byte is malformed.
        print("UTF-16", new String(helloWorld, StandardCharsets.UTF_16)); // => Length 7

        // The same text as big-endian UTF-16 without a byte order mark.
        byte[] helloWorld16 = {
            0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20,
            0x4e, 0x16, 0x75, 0x4c
        };
        print("UTF-16", new String(helloWorld16, StandardCharsets.UTF_16)); // => Length 9

        // Big-endian UTF-16 prefixed with the byte order mark FE FF.
        byte[] helloWorld16BE = {
            (byte) 0xfe, (byte) 0xff,
            0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20,
            0x4e, 0x16, 0x75, 0x4c
        };
        // UTF_16 consumes the mark; UTF_16BE keeps it as a leading U+FEFF char.
        print("UTF-16 BE1", new String(helloWorld16BE, StandardCharsets.UTF_16)); // => Length 9
        print("UTF-16 BE2", new String(helloWorld16BE, StandardCharsets.UTF_16BE)); // => Length 10
        // Wrong byte order: every unit is byte-swapped, so the text is garbage.
        print("UTF-16 LE1", new String(helloWorld16BE, StandardCharsets.UTF_16LE)); // => Length 10

        // Little-endian UTF-16 prefixed with the byte order mark FF FE.
        byte[] helloWorld16LE = {
            (byte) 0xff, (byte) 0xfe,
            0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x2c, 0x00, 0x20, 0x00,
            0x16, 0x4e, 0x4c, 0x75
        };
        // UTF_16 honours the mark and switches to little-endian; UTF_16LE keeps it as a char.
        print("UTF-16 LE2", new String(helloWorld16LE, StandardCharsets.UTF_16)); // => Length 9
        print("UTF-16 LE3", new String(helloWorld16LE, StandardCharsets.UTF_16LE)); // => Length 10
    }

    /** Prints strings spelled with Unicode escapes; each escape is one 16-bit code unit. */
    private static void demoEscapes() {
        System.out.printf("Hello, world is %s%n",
                "\u0048\u0065\u006c\u006c\u006f\u002c\u0020\u4e16\u754c");
        System.out.printf("World is %s%n", "\u4e16\u754c");
    }

    /** Shows U+10302 three ways: decoded from bytes, as an escaped pair, and computed. */
    private static void demoSurrogates() {
        // Big-endian surrogate pair D800 DF02. The low unit must be DF02 (not DF0C) so that
        // this line shows the same character as the two lines below.
        byte[] pairBytes = {(byte) 0xd8, 0x00, (byte) 0xdf, 0x02};
        System.out.println(new String(pairBytes, StandardCharsets.UTF_16));
        System.out.println("\ud800\udf02");

        char hs = Character.highSurrogate(0x10302);
        char ls = Character.lowSurrogate(0x10302);
        System.out.printf("%x %x%n", (int) hs, (int) ls); // => d800 df02
    }

    /** Compares precomposed and decomposed strings under NFC, NFD, NFKC and NFKD. */
    private static void demoNormalization() {
        String accentA = "\u00c1"; // precomposed capital A with acute
        String decomposedAccentA = "\u0041\u0301"; // capital A + combining acute accent
        System.out.printf(
                "%s logical eq %s, but alone \u0041, and accent alone \u0301, but eq? %b%n",
                accentA, decomposedAccentA, accentA.equals(decomposedAccentA)); // => eq? false

        String ffi = "\ufb03"; // single-char "ffi" ligature
        String compatibleDecomposedFFI = "\u0066\u0066\u0069"; // plain letters f, f, i
        System.out.printf("%s logical/semantically eq %s, but eq? %b%n",
                ffi, compatibleDecomposedFFI, ffi.equals(compatibleDecomposedFFI)); // => eq? false

        // Canonical forms on the accented letter: NFD splits, NFC composes.
        System.out.println(Normalizer.normalize(accentA, Normalizer.Form.NFD).length()); // => 2
        System.out.println(Normalizer.normalize(accentA, Normalizer.Form.NFC).length()); // => 1

        System.out.println(Normalizer.normalize(decomposedAccentA, Normalizer.Form.NFD).length()); // => 2
        System.out.println(Normalizer.normalize(decomposedAccentA, Normalizer.Form.NFC).length()); // => 1

        // Compatibility forms expand the ligature to three letters and never rebuild it.
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKD).length()); // => 3
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKC).length()); // => 3

        System.out.println(Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFKD).length()); // => 3
        System.out.println(Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFKC).length()); // => 3

        // Canonical forms leave the ligature (and the plain letters) untouched.
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFD).length()); // => 1
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFC).length()); // => 1

        System.out.println(Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFD).length()); // => 3
        System.out.println(Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFC).length()); // => 3

        // After normalizing to a common form the strings compare equal.
        System.out.println(accentA.equals(Normalizer.normalize(decomposedAccentA, Normalizer.Form.NFC))); // => true
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKC)
                .equals(Normalizer.normalize(compatibleDecomposedFFI, Normalizer.Form.NFKC))); // => true
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKC)); // => ffi
        System.out.println(Normalizer.normalize(ffi, Normalizer.Form.NFKD)); // => ffi
    }

    /**
     * Prints one row: the label left-aligned in 10 columns, the text left-aligned in 12
     * columns, and the text's length in {@code char}s (UTF-16 code units).
     *
     * @param charset label naming the charset used to produce {@code text}
     * @param text the decoded string to show
     * @return the stream returned by {@code System.out.printf}
     */
    private static PrintStream print(String charset, String text) {
        return System.out.printf("%-10s: %-12s Length %d%n", charset, text, text.length());
    }
}