Skip to content

Instantly share code, notes, and snippets.

@hucsmn
Created January 20, 2019 10:54
Show Gist options
  • Save hucsmn/1466df6828e72b77d07ff791218aba90 to your computer and use it in GitHub Desktop.
Save hucsmn/1466df6828e72b77d07ff791218aba90 to your computer and use it in GitHub Desktop.
Unicode case folding pitfall
'µ' (00b5): upper 'Μ' (039c) [ulu: 'Μ' (039c)], lower 'µ' (00b5) [lul: 'μ' (03bc)]
'İ' (0130): upper 'İ' (0130) [ulu: 'I' (0049)], lower 'i' (0069) [lul: 'i' (0069)]
'ı' (0131): upper 'I' (0049) [ulu: 'I' (0049)], lower 'ı' (0131) [lul: 'i' (0069)]
'ſ' (017f): upper 'S' (0053) [ulu: 'S' (0053)], lower 'ſ' (017f) [lul: 's' (0073)]
'ς' (03c2): upper 'Σ' (03a3) [ulu: 'Σ' (03a3)], lower 'ς' (03c2) [lul: 'σ' (03c3)]
'ϐ' (03d0): upper 'Β' (0392) [ulu: 'Β' (0392)], lower 'ϐ' (03d0) [lul: 'β' (03b2)]
'ϑ' (03d1): upper 'Θ' (0398) [ulu: 'Θ' (0398)], lower 'ϑ' (03d1) [lul: 'θ' (03b8)]
'ϕ' (03d5): upper 'Φ' (03a6) [ulu: 'Φ' (03a6)], lower 'ϕ' (03d5) [lul: 'φ' (03c6)]
'ϖ' (03d6): upper 'Π' (03a0) [ulu: 'Π' (03a0)], lower 'ϖ' (03d6) [lul: 'π' (03c0)]
'ϰ' (03f0): upper 'Κ' (039a) [ulu: 'Κ' (039a)], lower 'ϰ' (03f0) [lul: 'κ' (03ba)]
'ϱ' (03f1): upper 'Ρ' (03a1) [ulu: 'Ρ' (03a1)], lower 'ϱ' (03f1) [lul: 'ρ' (03c1)]
'ϴ' (03f4): upper 'ϴ' (03f4) [ulu: 'Θ' (0398)], lower 'θ' (03b8) [lul: 'θ' (03b8)]
'ϵ' (03f5): upper 'Ε' (0395) [ulu: 'Ε' (0395)], lower 'ϵ' (03f5) [lul: 'ε' (03b5)]
'ᲀ' (1c80): upper 'В' (0412) [ulu: 'В' (0412)], lower 'ᲀ' (1c80) [lul: 'в' (0432)]
'ᲁ' (1c81): upper 'Д' (0414) [ulu: 'Д' (0414)], lower 'ᲁ' (1c81) [lul: 'д' (0434)]
'ᲂ' (1c82): upper 'О' (041e) [ulu: 'О' (041e)], lower 'ᲂ' (1c82) [lul: 'о' (043e)]
'ᲃ' (1c83): upper 'С' (0421) [ulu: 'С' (0421)], lower 'ᲃ' (1c83) [lul: 'с' (0441)]
'ᲄ' (1c84): upper 'Т' (0422) [ulu: 'Т' (0422)], lower 'ᲄ' (1c84) [lul: 'т' (0442)]
'ᲅ' (1c85): upper 'Т' (0422) [ulu: 'Т' (0422)], lower 'ᲅ' (1c85) [lul: 'т' (0442)]
'ᲆ' (1c86): upper 'Ъ' (042a) [ulu: 'Ъ' (042a)], lower 'ᲆ' (1c86) [lul: 'ъ' (044a)]
'ᲇ' (1c87): upper 'Ѣ' (0462) [ulu: 'Ѣ' (0462)], lower 'ᲇ' (1c87) [lul: 'ѣ' (0463)]
'ᲈ' (1c88): upper 'Ꙋ' (a64a) [ulu: 'Ꙋ' (a64a)], lower 'ᲈ' (1c88) [lul: 'ꙋ' (a64b)]
'ẛ' (1e9b): upper 'Ṡ' (1e60) [ulu: 'Ṡ' (1e60)], lower 'ẛ' (1e9b) [lul: 'ṡ' (1e61)]
'ẞ' (1e9e): upper 'ẞ' (1e9e) [ulu: 'ß' (00df)], lower 'ß' (00df) [lul: 'ß' (00df)]
'ι' (1fbe): upper 'Ι' (0399) [ulu: 'Ι' (0399)], lower 'ι' (1fbe) [lul: 'ι' (03b9)]
'Ω' (2126): upper 'Ω' (2126) [ulu: 'Ω' (03a9)], lower 'ω' (03c9) [lul: 'ω' (03c9)]
'K' (212a): upper 'K' (212a) [ulu: 'K' (004b)], lower 'k' (006b) [lul: 'k' (006b)]
'Å' (212b): upper 'Å' (212b) [ulu: 'Å' (00c5)], lower 'å' (00e5) [lul: 'å' (00e5)]
orig: "µİıſςϐϑϕϖϰϱϴϵᲀᲁᲂᲃᲄᲅᲆᲇᲈẛẞιΩKÅ" ([ 00b5 0130 0131 017f 03c2 03d0 03d1 03d5 03d6 03f0 03f1 03f4 03f5 1c80 1c81 1c82 1c83 1c84 1c85 1c86 1c87 1c88 1e9b 1e9e 1fbe 2126 212a 212b])
nfkc: "μİısςβθφπκρΘεᲀᲁᲂᲃᲄᲅᲆᲇᲈṡẞιΩKÅ" ([ 03bc 0130 0131 0073 03c2 03b2 03b8 03c6 03c0 03ba 03c1 0398 03b5 1c80 1c81 1c82 1c83 1c84 1c85 1c86 1c87 1c88 1e61 1e9e 03b9 03a9 004b 00c5])
nfkd: "μİısςβθφπκρΘεᲀᲁᲂᲃᲄᲅᲆᲇᲈṡẞιΩKÅ" ([ 03bc 0049 0307 0131 0073 03c2 03b2 03b8 03c6 03c0 03ba 03c1 0398 03b5 1c80 1c81 1c82 1c83 1c84 1c85 1c86 1c87 1c88 0073 0307 1e9e 03b9 03a9 004b 0041 030a])
nfkc == nfkd: false
toupper(nfkc) <=> toupper(tolower(toupper(nfkc)))
$L: "ΜİISΣΒΘΦΠΚΡΘΕВДОСТТЪѢꙊṠẞΙΩKÅ" ([ 039c 0130 0049 0053 03a3 0392 0398 03a6 03a0 039a 03a1 0398 0395 0412 0414 041e 0421 0422 0422 042a 0462 a64a 1e60 1e9e 0399 03a9 004b 00c5])
$R: "ΜIISΣΒΘΦΠΚΡΘΕВДОСТТЪѢꙊṠßΙΩKÅ" ([ 039c 0049 0049 0053 03a3 0392 0398 03a6 03a0 039a 03a1 0398 0395 0412 0414 041e 0421 0422 0422 042a 0462 a64a 1e60 00df 0399 03a9 004b 00c5])
$L == $R: false
tolower(nfkc) <=> tolower(toupper(tolower(nfkc)))
$L: "μiısςβθφπκρθεᲀᲁᲂᲃᲄᲅᲆᲇᲈṡßιωkå" ([ 03bc 0069 0131 0073 03c2 03b2 03b8 03c6 03c0 03ba 03c1 03b8 03b5 1c80 1c81 1c82 1c83 1c84 1c85 1c86 1c87 1c88 1e61 00df 03b9 03c9 006b 00e5])
$R: "μiisσβθφπκρθεвдосттъѣꙋṡßιωkå" ([ 03bc 0069 0069 0073 03c3 03b2 03b8 03c6 03c0 03ba 03c1 03b8 03b5 0432 0434 043e 0441 0442 0442 044a 0463 a64b 1e61 00df 03b9 03c9 006b 00e5])
$L == $R: false
toupper(nfkd) <=> toupper(tolower(toupper(nfkd)))
$L: "ΜİISΣΒΘΦΠΚΡΘΕВДОСТТЪѢꙊṠẞΙΩKÅ" ([ 039c 0049 0307 0049 0053 03a3 0392 0398 03a6 03a0 039a 03a1 0398 0395 0412 0414 041e 0421 0422 0422 042a 0462 a64a 0053 0307 1e9e 0399 03a9 004b 0041 030a])
$R: "ΜİISΣΒΘΦΠΚΡΘΕВДОСТТЪѢꙊṠßΙΩKÅ" ([ 039c 0049 0307 0049 0053 03a3 0392 0398 03a6 03a0 039a 03a1 0398 0395 0412 0414 041e 0421 0422 0422 042a 0462 a64a 0053 0307 00df 0399 03a9 004b 0041 030a])
$L == $R: false
tolower(nfkd) <=> tolower(toupper(tolower(nfkd)))
$L: "μi̇ısςβθφπκρθεᲀᲁᲂᲃᲄᲅᲆᲇᲈṡßιωkå" ([ 03bc 0069 0307 0131 0073 03c2 03b2 03b8 03c6 03c0 03ba 03c1 03b8 03b5 1c80 1c81 1c82 1c83 1c84 1c85 1c86 1c87 1c88 0073 0307 00df 03b9 03c9 006b 0061 030a])
$R: "μi̇isσβθφπκρθεвдосттъѣꙋṡßιωkå" ([ 03bc 0069 0307 0069 0073 03c3 03b2 03b8 03c6 03c0 03ba 03c1 03b8 03b5 0432 0434 043e 0441 0442 0442 044a 0463 a64b 0073 0307 00df 03b9 03c9 006b 0061 030a])
$L == $R: false
package main
import (
"fmt"
"strings"
"unicode"
"golang.org/x/text/unicode/norm"
)
// main prints every letter whose unicode.ToUpper / unicode.ToLower result is
// not stable under a round trip through the opposite case, then checks whether
// NFKC or NFKD normalization of those letters makes strings.ToUpper and
// strings.ToLower stable under the same round trip.
func main() {
	unstable := findUnstableLetters()
	fmt.Println("")

	// Test whether normalization removes the instability.
	orig := string(unstable)
	nfkc := norm.NFKC.String(orig)
	nfkd := norm.NFKD.String(orig)
	fmt.Printf("orig: %q (% .4x)\n", orig, []rune(orig))
	fmt.Printf("nfkc: %q (% .4x)\n", nfkc, []rune(nfkc))
	fmt.Printf("nfkd: %q (% .4x)\n", nfkd, []rune(nfkd))
	fmt.Println("nfkc == nfkd:", nfkc == nfkd)
	fmt.Println("")

	forms := []struct {
		name string
		text string
	}{
		{"nfkc", nfkc},
		{"nfkd", nfkd},
	}
	for _, form := range forms {
		reportRoundTrip("toupper", "tolower", form.name, form.text, strings.ToUpper, strings.ToLower)
		reportRoundTrip("tolower", "toupper", form.name, form.text, strings.ToLower, strings.ToUpper)
	}
}

// findUnstableLetters scans every rune from 0 through unicode.MaxRune
// (inclusive) and returns, in ascending order, each letter r for which
// ToUpper(r) != ToUpper(ToLower(ToUpper(r))) or
// ToLower(r) != ToLower(ToUpper(ToLower(r))).
// One diagnostic line is printed to standard output for every such letter.
func findUnstableLetters() []rune {
	var unstable []rune
	for r := rune(0); r <= unicode.MaxRune; r++ {
		if !unicode.IsLetter(r) {
			continue
		}
		u := unicode.ToUpper(r)
		l := unicode.ToLower(r)
		ulu := unicode.ToUpper(unicode.ToLower(u))
		lul := unicode.ToLower(unicode.ToUpper(l))
		if u == ulu && l == lul {
			continue
		}
		fmt.Printf("%q (%.4x): upper %q (%.4x) [ulu: %q (%.4x)], lower %q (%.4x) [lul: %q (%.4x)]\n", r, r, u, u, ulu, ulu, l, l, lul, lul)
		unstable = append(unstable, r)
	}
	return unstable
}

// reportRoundTrip compares outer(text) with outer(inner(outer(text))) and
// prints both values, their code points, and whether they are equal.
// outerName, innerName and formName are used only to label the output; they
// are expected to name outer, inner and the normalization form of text.
func reportRoundTrip(outerName, innerName, formName, text string, outer, inner func(string) string) {
	once := outer(text)
	roundTrip := outer(inner(once))
	fmt.Printf("%s(%s) <=> %s(%s(%s(%s)))\n", outerName, formName, outerName, innerName, outerName, formName)
	fmt.Printf("$L: %q (% .4x)\n", once, []rune(once))
	fmt.Printf("$R: %q (% .4x)\n", roundTrip, []rune(roundTrip))
	fmt.Println("$L == $R:", once == roundTrip)
	fmt.Println("")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment