Skip to content

Instantly share code, notes, and snippets.

@fergusq
Last active January 22, 2024 19:02
Show Gist options
  • Save fergusq/08f03ab9a4f1c3c56825c2474d4dfc1b to your computer and use it in GitHub Desktop.
Save fergusq/08f03ab9a4f1c3c56825c2474d4dfc1b to your computer and use it in GitHub Desktop.
A small module that replaces cardinal numerals written with digits by the same numerals spelled out in words in Finnish text. Dependencies: taivutin, stanza.
import taivutin
import stanza
import re
# Stanza pipeline constructed for the language code "fi"; process_texts()
# passes its documents through this object.
nlp = stanza.Pipeline("fi")
# Taivutin instance whose taivuta(word, case) method the inflect* functions
# call to produce inflected word forms.
tai = taivutin.Taivutin()
# Translates the value of the "Case" feature found in Stanza word features
# into the case name that is handed to tai.taivuta().
case_stanza2taivutin = dict(
    Nom="nom",
    Gen="gen",
    Par="part",
    Ess="ess",
    Ine="iness",
    Ela="elat",
    Ill="ill",
    Ade="ade",
    Abl="abl",
    All="all",
    Tra="trans",
    Ins="ins",
    Abe="abe",
    Com="kom",
)
# Base (nominative) words for the digits 1-9, keyed by the digit's value.
# There is deliberately no entry for 0.
number_table = dict(
    enumerate(
        (
            "yksi",
            "kaksi",
            "kolme",
            "neljä",
            "viisi",
            "kuusi",
            "seitsemän",
            "kahdeksan",
            "yhdeksän",
        ),
        start=1,
    )
)
# Base (nominative) words for 11-19: the digit word followed by "toista".
number_table_toista = {
    10 + unit: word + "toista"
    for unit, word in enumerate(
        (
            "yksi",
            "kaksi",
            "kolme",
            "neljä",
            "viisi",
            "kuusi",
            "seitsemän",
            "kahdeksan",
            "yhdeksän",
        ),
        start=1,
    )
}
# Magnitude words used inside a three-digit group. The entries are listed
# from largest to smallest because inflect_small_num() walks this dict in
# insertion order and peels off the hundreds before the tens.
number_table_small_mags = {
    10**2: "sata",
    10**1: "kymmenen",
}
# Magnitude words for groups of a thousand and above. The entries are listed
# from largest to smallest because inflect() walks this dict in insertion
# order and peels off the biggest magnitude first.
number_table_large_mags = {
    10**12: "biljoona",
    10**9: "miljardi",
    10**6: "miljoona",
    10**3: "tuhat",
}
def inflect(num: str, case: str):
    """Spell out an integer given as a digit string, inflected in ``case``.

    Args:
        num: Decimal digits of a non-negative integer. Spaces (for example
            thousands separators) are removed before parsing.
        case: Case name passed on to ``tai.taivuta`` and
            ``inflect_small_num``, e.g. ``"nom"`` or ``"gen mon"``.

    Returns:
        The number written out as one string with no separators between its
        parts. Zero is returned as the inflected word "nolla".

    Raises:
        ValueError: If ``num`` is not an integer literal once spaces are
            removed.
        AssertionError: If the value is 10**15 or larger.
    """
    value = int(num.replace(" ", ""))
    assert value < 1_000_000_000_000_000, value

    if value == 0:
        # Neither the magnitude loop nor inflect_small_num() emits anything
        # for zero, so without this branch the result would be "" and the
        # caller would erase the number from the text.
        # NOTE(review): assumes taivutin can inflect "nolla" - confirm.
        return tai.taivuta("nolla", case)

    parts = []
    for mag, magword in number_table_large_mags.items():
        count = value // mag
        if count <= 0:
            continue
        if count != 1:
            # A multiplier of one is left unspoken ("tuhat", not "yksituhat").
            parts.append(inflect_small_num(count, case))
        # In the nominative a multiplier above one takes the magnitude word
        # in the partitive; in every other case the magnitude word simply
        # follows the requested case.
        if case != "nom":
            mag_case = case
        elif count > 1:
            mag_case = "part"
        else:
            mag_case = "nom"
        parts.append(tai.taivuta(magword, mag_case))
        value -= count * mag

    # Whatever is left is below one thousand.
    parts.append(inflect_small_num(value, case))
    return "".join(parts)
def inflect_small_num(num: int, case: str):
    """Spell out an integer below one thousand, inflected in ``case``.

    Args:
        num: The value to write out; must be less than 1000.
        case: Case name passed on to ``tai.taivuta`` and
            ``inflect_single_num``.

    Returns:
        The hundreds, tens and units parts concatenated without separators.
        The result is an empty string when ``num`` is 0.

    Raises:
        AssertionError: If ``num`` is 1000 or larger.
    """
    assert num < 1_000, num
    pieces = []
    is_teen = False
    remaining = num
    for magnitude, magnitude_word in number_table_small_mags.items():
        multiplier = remaining // magnitude
        if multiplier <= 0:
            continue
        if 11 <= remaining <= 19:
            # 11-19 are the inflected digit word plus the fixed suffix
            # "toista"; the units must not be appended a second time below.
            is_teen = True
            pieces.append(inflect_single_num(remaining - 10, case) + "toista")
        else:
            if multiplier != 1:
                pieces.append(inflect_single_num(multiplier, case))
            # Nominative with a multiplier above one puts the magnitude word
            # in the partitive; otherwise it follows the requested case.
            if case != "nom":
                magnitude_case = case
            elif multiplier > 1:
                magnitude_case = "part"
            else:
                magnitude_case = "nom"
            pieces.append(tai.taivuta(magnitude_word, magnitude_case))
        remaining -= multiplier * magnitude
    if not is_teen:
        pieces.append(inflect_single_num(remaining, case))
    return "".join(pieces)
def inflect_single_num(num: int, case: str):
    """Return the word for a single digit, inflected in ``case``.

    Args:
        num: Digit value; must be less than 10.
        case: Case name passed to ``tai.taivuta``. Plural cases carry the
            suffix " mon" (e.g. ``"gen mon"``).

    Returns:
        The inflected digit word, or an empty string when ``num`` is 0.

    Raises:
        AssertionError: If ``num`` is 10 or larger.
    """
    assert num < 10
    if num == 0:
        return ""
    word = tai.taivuta(number_table[num], case)
    # Kludge, because taivutin cannot handle this word: for "viisi" the "si"
    # of the stem is swapped for "de" in the singular non-nominative cases
    # and in the plural nominative.
    # NOTE(review): the original condition read `num == 5 or num == 5`; the
    # second operand may have been meant to be 6 ("kuusi"). Confirm what
    # taivutin returns for "kuusi" before extending the workaround.
    needs_stem_fix = (
        num == 5
        and case != "nom"
        and (" mon" not in case or case == "nom mon")
    )
    if needs_stem_fix:
        # Replace only the first "si" (the stem). An unbounded replace would
        # also rewrite a case ending that contains "si", such as the
        # translative "-ksi".
        word = word.replace("si", "de", 1)
    return word
def parse_feats(feats: str | None):
if not feats:
return {}
return {a: b for [a, b] in (f.split("=") for f in feats.split("|"))}
def process_texts(texts: list[str]):
    """Yield each input text with its cardinal digit numerals spelled out.

    This is a generator: it yields exactly one string per element of
    ``texts``, in the same order. A text in which nothing was substituted is
    yielded unchanged.

    Args:
        texts: The texts to process. All of them are wrapped in
            ``stanza.Document`` objects and passed to ``nlp`` in one call.

    Yields:
        The text with every selected numeral replaced by the result of
        ``inflect``.
    """
    docs = [stanza.Document([], text=text) for text in texts]
    docs = nlp(docs)
    for text, doc in zip(texts, docs):
        # (start_char, end_char, replacement) triples, collected in the order
        # the words are visited.
        substitutions = []
        for sent in doc.sentences:
            for word in sent.words:
                feats = parse_feats(word.feats)
                # Only words whose lemma consists of digits/spaces and that
                # are tagged NumType=Card are candidates.
                if re.fullmatch(r"[\d ]+", word.lemma) and feats.get("NumType", None) == "Card":
                    # Numeral modifier whose head comes later in the sentence.
                    if word.deprel == "nummod" and word.head > word.id:
                        # word.head is used as a 1-based index into sent.words.
                        head = sent.words[word.head - 1]
                        if head.upos == "NOUN":
                            head_feats = parse_feats(head.feats)
                            # Take the case from the head noun; unknown or
                            # missing cases fall back to the nominative.
                            case = head_feats.get("Case", "Nom")
                            case = case_stanza2taivutin.get(case, "nom")
                            # A partitive head only makes the numeral
                            # partitive when the numeral itself is tagged
                            # Par; otherwise the numeral is nominative.
                            if case == "part" and feats.get("Case", "Nom") != "Par":
                                case = "nom"
                            # Plural heads select the plural form of the case.
                            if head_feats.get("Number", "Sing") == "Plur":
                                case += " mon"
                            substitutions.append((word.start_char, word.end_char, inflect(word.lemma, case)))
                    # NOTE(review): the indentation of this function had to be
                    # reconstructed. This `else` is attached to the
                    # nummod/position test above; attaching it to the
                    # `head.upos == "NOUN"` test is also syntactically
                    # possible. Confirm against the original file.
                    else:
                        case = "nom"
                        substitutions.append((word.start_char, word.end_char, inflect(word.lemma, case)))
        if not substitutions:
            yield text
        else:
            # Splice the replacements into the text from left to right.
            # Assumes the collected spans are in ascending order and do not
            # overlap - TODO confirm for the tokenizer in use.
            ans = ""
            pos = 0
            for start, end, word in substitutions:
                ans += text[pos:start]
                ans += word
                pos = end
            ans += text[pos:]
            yield ans
if __name__ == "__main__":
    # Interactive filter: read one line at a time from standard input and
    # print it with its numerals spelled out.
    while True:
        try:
            text = input()
        except EOFError:
            # End of input (Ctrl-D or an exhausted pipe): stop quietly
            # instead of terminating with a traceback.
            break
        for s in process_texts([text]):
            print(s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment