Last active
August 28, 2018 14:03
-
-
Save jtauber/ed07e0fd15ecdc5394755d3e0c9304f8 to your computer and use it in GitHub Desktop.
normalisation code for graves and extra accents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Combining marks (as they appear after NFD decomposition) treated as accents.
VARIA = "\u0300"  # COMBINING GRAVE ACCENT
OXIA = "\u0301"  # COMBINING ACUTE ACCENT
PERISPOMENI = "\u0342"  # COMBINING GREEK PERISPOMENI
ACCENTS = [VARIA, OXIA, PERISPOMENI]
def strip_accents(s):
    """Return *s* with every character listed in ``ACCENTS`` removed.

    The string is first decomposed with NFD so that accents become
    separate combining characters; those in ``ACCENTS`` are dropped and
    the remainder is recomposed with NFKC.  Because the recomposition is
    NFKC, compatibility characters in *s* are also rewritten to their
    compatibility equivalents, even when they carry no accent.
    """
    decomposed = unicodedata.normalize("NFD", s)
    return unicodedata.normalize(
        "NFKC", "".join(c for c in decomposed if c not in ACCENTS)
    )
def count_accents(s):
    """Return how many characters from ``ACCENTS`` occur in the NFD form of *s*."""
    return sum(1 for c in unicodedata.normalize("NFD", s) if c in ACCENTS)
def strip_last_accent(word):
    """Return *word* with the accents removed from its last accented character.

    Characters are scanned from the end of *word*; the first one found that
    carries an accent from ``ACCENTS`` is replaced by ``strip_accents`` of
    itself and the scan stops.  A word with no accented character is
    returned unchanged.
    """
    chars = list(word)
    for idx in range(len(chars) - 1, -1, -1):
        ch = chars[idx]
        # Test for a real accent rather than "strip_accents changed it":
        # strip_accents recomposes with NFKC, which also rewrites accent-less
        # compatibility characters (e.g. U+1FBD GREEK KORONIS), and those must
        # not end the search before an actual accent has been removed.
        if count_accents(ch):
            chars[idx] = strip_accents(ch)
            break
    return "".join(chars)
# change graves to acutes
# (decompose first so the grave is a separate combining character;
# OXIA will be normalized to TONOS by the NFKC step below if needed)
temp = unicodedata.normalize("NFD", norm).replace(VARIA, OXIA)
norm = unicodedata.normalize("NFKC", temp)
# strip last accent if two
if count_accents(norm) == 2:
    pre_norm = norm
    norm = strip_last_accent(norm)
    # sanity check: exactly one accent must remain; report before/after on failure
    assert count_accents(norm) == 1, (pre_norm, norm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment