Last active
January 22, 2024 19:02
-
-
Save fergusq/08f03ab9a4f1c3c56825c2474d4dfc1b to your computer and use it in GitHub Desktop.
A small module that replaces cardinal numerals written with digits in Finnish text with the same numerals spelled out in words. Dependencies: taivutin, stanza
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import taivutin | |
import stanza | |
import re | |
# Stanza pipeline for Finnish ("fi"); process_texts() passes its documents through it.
nlp = stanza.Pipeline("fi")
# Inflector instance; the inflect_* functions call its taivuta(word, case) method.
tai = taivutin.Taivutin()
# Maps the value of the "Case" feature found in Stanza's feature strings to
# the case name that is handed to tai.taivuta().
case_stanza2taivutin = dict(
    Nom="nom",
    Gen="gen",
    Par="part",
    Ess="ess",
    Ine="iness",
    Ela="elat",
    Ill="ill",
    Ade="ade",
    Abl="abl",
    All="all",
    Tra="trans",
    Ins="ins",
    Abe="abe",
    Com="kom",
)
# Base (uninflected) Finnish words for the digits 1-9, keyed by the digit.
number_table = dict(
    enumerate(
        (
            "yksi",
            "kaksi",
            "kolme",
            "neljä",
            "viisi",
            "kuusi",
            "seitsemän",
            "kahdeksan",
            "yhdeksän",
        ),
        start=1,
    )
)
# Base (uninflected) Finnish words for 11-19, keyed by the number.
number_table_toista = dict(
    zip(
        range(11, 20),
        (
            "yksitoista",
            "kaksitoista",
            "kolmetoista",
            "neljätoista",
            "viisitoista",
            "kuusitoista",
            "seitsemäntoista",
            "kahdeksantoista",
            "yhdeksäntoista",
        ),
    )
)
# Magnitude words used within a group of three digits (hundreds and tens).
# Keep the entries ordered from largest to smallest: inflect_small_num()
# walks them in insertion order and subtracts each magnitude in turn.
number_table_small_mags = {
    100: "sata",
    10: "kymmenen",
}
# Magnitude words for thousands and above.
# Keep the entries ordered from largest to smallest: inflect() walks them in
# insertion order and subtracts each magnitude in turn.
number_table_large_mags = {
    1_000_000_000_000: "biljoona",
    1_000_000_000: "miljardi",
    1_000_000: "miljoona",
    1_000: "tuhat",
}
def inflect(num: str, case: str):
    """Spell out the cardinal number ``num`` in Finnish, inflected in ``case``.

    Args:
        num: Decimal digits of the number; spaces (e.g. thousands
            separators) are removed before parsing.
        case: Case name handed to ``tai.taivuta`` for every part of the
            numeral (for example ``"nom"`` or ``"gen"``).

    Returns:
        The inflected parts of the numeral concatenated without separators.

    Raises:
        ValueError: If ``num`` is not an integer once spaces are removed.
        AssertionError: If the number is 10**15 or larger.
    """
    value = int(num.replace(" ", ""))
    assert value < 1_000_000_000_000_000, value

    if value == 0:
        # Every helper below produces "" for zero, which would make callers
        # silently delete the number from the text; spell it out instead.
        return tai.taivuta("nolla", case)

    parts = []
    for mag, magword in number_table_large_mags.items():
        count = value // mag
        if count > 0:
            # A multiplier of one is left implicit ("tuhat", not "yksituhat").
            if count != 1:
                parts.append(inflect_small_num(count, case))
            # In the nominative, a magnitude word with a multiplier above one
            # is put in the partitive; any other case is applied as given.
            if case != "nom":
                mag_case = case
            elif count > 1:
                mag_case = "part"
            else:
                mag_case = "nom"
            parts.append(tai.taivuta(magword, mag_case))
            value -= count * mag
    # Whatever is left is below 1000.
    parts.append(inflect_small_num(value, case))
    return "".join(parts)
def inflect_small_num(num: int, case: str):
    """Spell out an integer below 1000 in Finnish, inflected in ``case``.

    Hundreds and tens come from ``number_table_small_mags``. The numbers
    11-19 are written as the inflected unit word followed by the fixed
    suffix "toista". Zero produces an empty string.

    Raises:
        AssertionError: If ``num`` is 1000 or larger.
    """
    assert num < 1_000, num
    pieces = []
    remaining = num
    teen_done = False
    for magnitude, word in number_table_small_mags.items():
        multiplier = remaining // magnitude
        if multiplier <= 0:
            continue
        if 11 <= remaining <= 19:
            # Teens: the unit is inflected, "toista" is appended as-is.
            teen_done = True
            pieces.append(inflect_single_num(remaining - 10, case) + "toista")
        else:
            # A multiplier of one is left implicit ("sata", not "yksisata").
            if multiplier != 1:
                pieces.append(inflect_single_num(multiplier, case))
            # In the nominative, a magnitude word with a multiplier above
            # one is put in the partitive.
            if case != "nom":
                word_case = case
            elif multiplier > 1:
                word_case = "part"
            else:
                word_case = "nom"
            pieces.append(tai.taivuta(word, word_case))
        remaining -= multiplier * magnitude
    # The teen branch already consumed the units digit.
    if not teen_done:
        pieces.append(inflect_single_num(remaining, case))
    return "".join(pieces)
def inflect_single_num(num: int, case: str):
    """Return the Finnish word for a single digit, inflected in ``case``.

    Zero yields an empty string; 1-9 are looked up in ``number_table`` and
    inflected with ``tai.taivuta``.

    Raises:
        AssertionError: If ``num`` is 10 or larger.
    """
    assert num < 10
    if num == 0:
        return ""
    inflected = tai.taivuta(number_table[num], case)
    # Hack, because taivutin cannot handle this form itself.
    # NOTE(review): only 5 is patched here; this condition used to compare
    # num against 5 twice, so a second numeral may have been intended - confirm.
    singular_or_nom_plural = " mon" not in case or case == "nom mon"
    if num == 5 and case != "nom" and singular_or_nom_plural:
        inflected = inflected.replace("si", "de")
    return inflected
def parse_feats(feats: str | None): | |
if not feats: | |
return {} | |
return {a: b for [a, b] in (f.split("=") for f in feats.split("|"))} | |
def process_texts(texts: list[str]):
    """Replace digit-written cardinal numerals in each text with Finnish words.

    All texts are analysed in one call to the module-level ``nlp`` pipeline.
    This is a generator: it yields one string per input text, in input
    order. A text in which nothing was replaced is yielded unchanged.

    Args:
        texts: The texts to process.
    """
    docs = [stanza.Document([], text=text) for text in texts]
    docs = nlp(docs)
    for text, doc in zip(texts, docs):
        # (start offset, end offset, replacement) triples, collected in word order.
        substitutions = []
        for sent in doc.sentences:
            for word in sent.words:
                feats = parse_feats(word.feats)
                # Only words whose lemma is digits/spaces and that are tagged as cardinals.
                if re.fullmatch(r"[\d ]+", word.lemma) and feats.get("NumType", None) == "Card":
                    # Numeric modifier whose head comes later in the sentence.
                    if word.deprel == "nummod" and word.head > word.id:
                        # word.head is used as a 1-based index into sent.words.
                        head = sent.words[word.head - 1]
                        if head.upos == "NOUN":
                            head_feats = parse_feats(head.feats)
                            #print("h", head_feats)
                            # Take the case from the head noun; unknown cases fall back to "nom".
                            case = head_feats.get("Case", "Nom")
                            case = case_stanza2taivutin.get(case, "nom")
                            # A partitive head only makes the numeral partitive when the
                            # numeral itself is tagged partitive; otherwise use nominative.
                            if case == "part" and feats.get("Case", "Nom") != "Par":
                                case = "nom"
                            # " mon" marks the plural in the case string given to inflect().
                            if head_feats.get("Number", "Sing") == "Plur":
                                case += " mon"
                            substitutions.append((word.start_char, word.end_char, inflect(word.lemma, case)))
                    # NOTE(review): the nesting of this else was reconstructed from a copy
                    # without indentation; it is assumed to pair with the "nummod" test
                    # above (so numerals in other positions get the nominative) - confirm.
                    else:
                        case = "nom"
                        substitutions.append((word.start_char, word.end_char, inflect(word.lemma, case)))
        if not substitutions:
            yield text
        else:
            # Rebuild the text: copy the gaps between substitutions verbatim
            # and insert each replacement in place of its character span.
            ans = ""
            pos = 0
            for start, end, word in substitutions:
                ans += text[pos:start]
                ans += word
                pos = end
            ans += text[pos:]
            yield ans
if __name__ == "__main__":
    # Interactive filter: read one line at a time from standard input and
    # print it with its numerals spelled out.
    while True:
        try:
            text = input()
        except EOFError:
            # End of input (Ctrl-D or the end of a piped file): stop cleanly
            # instead of crashing with a traceback.
            break
        for s in process_texts([text]):
            print(s)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment