NFC, NFD, NFKC, NFKD
input:
it’säå(1−2)ドブロク㍿
result:
NFC : it’säå(1−2)ドブロク㍿ (45 bytes)
NFD : it’säå(1−2)ドブロク㍿ (50 bytes)
NFKC: it’säå(1−2)ドブロク株式会社 (41 bytes)
NFKD: it’säå(1−2)ドブロク株式会社 (49 bytes)
import unicodedata
def conv_and_print(form, src="it’säå(1−2)ドブロク㍿"):
    """Normalize ``src`` to a Unicode normalization form and print the result.

    Prints one line of the shape ``<form>: <normalized> (<n> bytes)``, where
    ``<n>`` is the length of the normalized text once encoded as UTF-8.

    Args:
        form: Normalization form name passed straight to
            ``unicodedata.normalize`` ("NFC", "NFD", "NFKC" or "NFKD").
        src: Text to normalize. Defaults to the sample string used in this
            example so existing one-argument calls keep working.

    Returns:
        None. The result is written to standard output only.
    """
    norm = unicodedata.normalize(form, src)
    # Report the UTF-8 byte count, not len(norm): the latter counts code
    # points and would hide the size difference between the forms.
    print(f"{form}: {norm} ({len(norm.encode('utf-8'))} bytes)")
# Print the sample string in each of the four normalization forms, in order.
for form in ("NFC", "NFD", "NFKC", "NFKD"):
    conv_and_print(form)
You can use the golang.org/x/text/unicode/norm package.
package main
import (
"fmt"
"golang.org/x/text/unicode/norm"
)
// main prints the sample string in each of the four Unicode normalization
// forms, followed by the byte length of the normalized string.
func main() {
	src := "it’säå(1−2)ドブロク㍿"
	// A slice instead of a map: Go does not specify map iteration order, so
	// ranging over a map would print the forms in a different order per run.
	forms := []struct {
		name string
		form norm.Form
	}{
		{"NFC", norm.NFC},
		{"NFD", norm.NFD},
		{"NFKC", norm.NFKC},
		{"NFKD", norm.NFKD},
	}
	for _, f := range forms {
		// Named "normalized" so the imported norm package is not shadowed
		// inside the loop body.
		normalized := f.form.String(src)
		// len on a Go string counts bytes, which is what the output reports.
		fmt.Printf("%s: %v (%v bytes)\n", f.name, normalized, len(normalized))
	}
}
You can use the unicode-normalization crate.
/// Prints the sample string in each of the four Unicode normalization forms,
/// each followed by the byte length of the normalized string.
fn main() {
    use unicode_normalization::UnicodeNormalization;

    let input = "it’säå(1−2)ドブロク㍿";

    // Each normalization adaptor yields chars; collect them into an owned
    // String so the byte length can be taken afterwards.
    let normalized = [
        ("NFC", input.nfc().collect::<String>()),
        ("NFD", input.nfd().collect::<String>()),
        ("NFKC", input.nfkc().collect::<String>()),
        ("NFKD", input.nfkd().collect::<String>()),
    ];

    for (label, text) in &normalized {
        println!("{}: {} ({} bytes)", label, text, text.len());
    }
}