Skip to content

Instantly share code, notes, and snippets.

@wasi-master
Last active December 30, 2021 05:04
Show Gist options
  • Save wasi-master/c68a8065fd2234b196cfe2a8c1723afc to your computer and use it in GitHub Desktop.
Save wasi-master/c68a8065fd2234b196cfe2a8c1723afc to your computer and use it in GitHub Desktop.
NFKC Normalization mapping
{
' ': ['\xa0', '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200a', '\u202f', '\u205f', '\u3000'],
'!': ['︕', '﹗', '!'],
'!?': ['⁉'],
'"': ['"'],
'#': ['﹟', '#'],
'$': ['﹩', '$'],
'%': ['﹪', '%'],
'&': ['﹠', '&'],
"'": ['＇'],
'(': ['⁽', '₍', '︵', '﹙', '('],
')': ['⁾', '₎', '︶', '﹚', ')'],
'*': ['﹡', '*'],
'+': ['⁺', '₊', '﬩', '﹢', '+'],
',': ['︐', '﹐', ','],
'-': ['﹣', '-'],
'.': ['․', '﹒', '.'],
'..': ['‥', '︰'],
'...': ['…', '︙'],
'/': ['/'],
'0': ['⁰', '₀', '⓪', '0'],
'1': ['¹', '₁', '①', '1'],
'1.': ['⒈'],
'10.': ['⒑'],
'11.': ['⒒'],
'12.': ['⒓'],
'13.': ['⒔'],
'14.': ['⒕'],
'15.': ['⒖'],
'16.': ['⒗'],
'17.': ['⒘'],
'18.': ['⒙'],
'19.': ['⒚'],
'2': ['²', '₂', '②', '2'],
'2.': ['⒉'],
'20.': ['⒛'],
'3': ['³', '₃', '③', '3'],
'3.': ['⒊'],
'4': ['⁴', '₄', '④', '4'],
'4.': ['⒋'],
'5': ['⁵', '₅', '⑤', '5'],
'5.': ['⒌'],
'6': ['⁶', '₆', '⑥', '6'],
'6.': ['⒍'],
'7': ['⁷', '₇', '⑦', '7'],
'7.': ['⒎'],
'8': ['⁸', '₈', '⑧', '8'],
'8.': ['⒏'],
'9': ['⁹', '₉', '⑨', '9'],
'9.': ['⒐'],
':': ['︓', '﹕', ':'],
'::=': ['⩴'],
';': [';', '︔', '﹔', ';'],
'<': ['﹤', '<'],
'=': ['⁼', '₌', '﹦', '='],
'==': ['⩵'],
'===': ['⩶'],
'>': ['﹥', '>'],
'?': ['︖', '﹖', '?'],
'?!': ['⁈'],
'??': ['⁇'],
'@': ['﹫', '@'],
'A': ['ᴬ', 'Ⓐ', 'A'],
'B': ['ᴮ', 'ℬ', 'Ⓑ', 'B'],
'C': ['ℂ', 'ℭ', 'Ⅽ', 'Ⓒ', 'C'],
'Co.': ['㏇'],
'D': ['ᴰ', 'ⅅ', 'Ⅾ', 'Ⓓ', 'D'],
'E': ['ᴱ', 'ℰ', 'Ⓔ', 'E'],
'F': ['ℱ', 'Ⓕ', 'F'],
'G': ['ᴳ', 'Ⓖ', 'G'],
'H': ['ᴴ', 'ℋ', 'ℌ', 'ℍ', 'Ⓗ', 'H'],
'I': ['ᴵ', 'ℐ', 'ℑ', 'Ⅰ', 'Ⓘ', 'I'],
'J': ['ᴶ', 'Ⓙ', 'J'],
'K': ['ᴷ', 'K', 'Ⓚ', 'K'],
'L': ['ᴸ', 'ℒ', 'Ⅼ', 'Ⓛ', 'L'],
'M': ['ᴹ', 'ℳ', 'Ⅿ', 'Ⓜ', 'M'],
'N': ['ᴺ', 'ℕ', 'Ⓝ', 'N'],
'O': ['ᴼ', 'Ⓞ', 'O'],
'P': ['ᴾ', 'ℙ', 'Ⓟ', 'P'],
'Q': ['ℚ', 'Ⓠ', 'Q'],
'R': ['ᴿ', 'ℛ', 'ℜ', 'ℝ', 'Ⓡ', 'R'],
'S': ['Ⓢ', 'S'],
'T': ['ᵀ', 'Ⓣ', 'T'],
'U': ['ᵁ', 'Ⓤ', 'U'],
'V': ['Ⅴ', 'Ⓥ', 'ⱽ', 'V'],
'W': ['ᵂ', 'Ⓦ', 'W'],
'X': ['Ⅹ', 'Ⓧ', 'X'],
'Y': ['Ⓨ', 'Y'],
'Z': ['ℤ', 'ℨ', 'Ⓩ', 'Z'],
'[': ['﹇', '['],
'\\': ['﹨', '＼'],
']': ['﹈', ']'],
'^': ['^'],
'_': ['︳', '︴', '﹍', '﹎', '﹏', '_'],
'`': ['`', '`'],
'a': ['ª', 'ᵃ', 'ₐ', 'ⓐ', 'a'],
'a.m.': ['㏂'],
'a/c': ['℀'],
'a/s': ['℁'],
'b': ['ᵇ', 'ⓑ', 'b'],
'c': ['ᶜ', 'ⅽ', 'ⓒ', 'c'],
'c/o': ['℅'],
'c/u': ['℆'],
'd': ['ᵈ', 'ⅆ', 'ⅾ', 'ⓓ', 'd'],
'e': ['ᵉ', 'ₑ', 'ℯ', 'ⅇ', 'ⓔ', 'e'],
'f': ['ᶠ', 'ⓕ', 'f'],
'g': ['ᵍ', 'ℊ', 'ⓖ', 'g'],
'h': ['ʰ', 'ℎ', 'ⓗ', 'h'],
'i': ['ᵢ', 'ⁱ', 'ℹ', 'ⅈ', 'ⅰ', 'ⓘ', 'i'],
'j': ['ʲ', 'ⅉ', 'ⓙ', 'ⱼ', 'j'],
'k': ['ᵏ', 'ⓚ', 'k'],
'l': ['ˡ', 'ℓ', 'ⅼ', 'ⓛ', 'l'],
'm': ['ᵐ', 'ⅿ', 'ⓜ', 'm'],
'n': ['ⁿ', 'ⓝ', 'n'],
'o': ['º', 'ᵒ', 'ₒ', 'ℴ', 'ⓞ', 'o'],
'p': ['ᵖ', 'ⓟ', 'p'],
'p.m.': ['㏘'],
'q': ['ⓠ', 'q'],
'r': ['ʳ', 'ᵣ', 'ⓡ', 'r'],
's': ['ſ', 'ˢ', 'ⓢ', 's'],
't': ['ᵗ', 'ⓣ', 't'],
'u': ['ᵘ', 'ᵤ', 'ⓤ', 'u'],
'v': ['ᵛ', 'ᵥ', 'ⅴ', 'ⓥ', 'v'],
'w': ['ʷ', 'ⓦ', 'w'],
'x': ['ˣ', 'ₓ', 'ⅹ', 'ⓧ', 'x'],
'y': ['ʸ', 'ⓨ', 'y'],
'z': ['ᶻ', 'ⓩ', 'z'],
'{': ['︷', '﹛', '{'],
'|': ['|'],
'}': ['︸', '﹜', '}'],
'~': ['~']
}
import requests
import rich
from bs4 import BeautifulSoup
from rich.syntax import Syntax
# Scrape a Unicode normalization table and build `output`: a dict mapping the
# text of each row's first cell to the list of alternative characters whose
# cell text mentions "NFKC". The finished mapping is pretty-printed with rich.
url = "https://appcheck-ng.com/wp-content/uploads/unicode_normalization.html"

# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as data.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")

table = soup.find("table")
if table is None:
    raise SystemExit(f"No <table> element found at {url}")

# The first <tr> is skipped (presumably the header row -- verify against the page).
characters = table.find_all("tr")[1:]

output = {}
for character in characters:
    cells = character.find_all("td")
    if not cells:
        # A row without any <td> cells carries nothing to record.
        continue
    original = cells[0].text
    # Cells from the third one onwards are the candidate alternatives.
    for alternative in cells[2:]:
        if "NFKC" not in alternative.text:
            # Report the second ';'-separated field when the cell has one;
            # otherwise fall back to the whole cell text instead of crashing.
            fields = alternative.text.split(";")
            label = fields[1] if len(fields) > 1 else alternative.text
            print(label, '"is not NFKC')
            continue
        span = alternative.find("span")
        if span is None:
            # No <span> holding the character itself; nothing to add.
            continue
        output.setdefault(original, []).append(span.text)

rich.print(output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment