fergusq/finnish_num_infl.py

## finnish_num_infl.py
import taivutin
import stanza
import re

# Both objects are built once, when the module is imported.
# stanza pipeline for language code "fi"; process_texts() calls it on the
# documents it builds from the input texts.
nlp = stanza.Pipeline("fi")
# Inflector object; the inflect_* functions call its taivuta(word, case)
# method to put individual numeral words into the requested case.
tai = taivutin.Taivutin()

# Translates the value of the `Case` feature found in stanza's morphological
# feats into the case name that is handed to tai.taivuta().
case_stanza2taivutin = dict(
	Nom="nom",
	Gen="gen",
	Par="part",
	Ess="ess",
	Ine="iness",
	Ela="elat",
	Ill="ill",
	Ade="ade",
	Abl="abl",
	All="all",
	Tra="trans",
	Ins="ins",
	Abe="abe",
	Com="kom",
)

# Base (nominative) forms of the digits 1-9, keyed by their value.
number_table = dict(enumerate(
	(
		"yksi",
		"kaksi",
		"kolme",
		"neljä",
		"viisi",
		"kuusi",
		"seitsemän",
		"kahdeksan",
		"yhdeksän",
	),
	start=1,
))

# Base forms of 11-19: the digit word followed by "toista".
# NOTE(review): none of the functions visible in this file read this table.
number_table_toista = {
	10 + digit: word + "toista" for digit, word in number_table.items()
}

# Magnitude words used inside a three-digit group. inflect_small_num() walks
# this dict in insertion order and subtracts as it goes, so the larger
# magnitude has to come first.
number_table_small_mags = {100: "sata", 10: "kymmenen"}

# Magnitude words for thousands and above. inflect() walks this dict in
# insertion order and subtracts as it goes, so it must stay sorted from the
# largest magnitude to the smallest.
number_table_large_mags = {
	10**12: "biljoona",
	10**9: "miljardi",
	10**6: "miljoona",
	10**3: "tuhat",
}

def inflect(num: str, case: str):
	"""Spell out a digit string as a single inflected Finnish numeral word.

	Args:
		num: The number as digits; spaces (e.g. thousands grouping as in
			"12 000") are removed before conversion.
		case: Case name understood by taivutin (e.g. "nom", "gen", "part"),
			optionally followed by " mon", which process_texts() appends
			when the head noun is plural.

	Returns:
		The numeral written as one word, with each component inflected.
		Zero is returned as the inflected form of "nolla".

	Raises:
		ValueError: If `num` is not an integer literal once spaces are removed.
		AssertionError: If the value is 10**15 or larger.
	"""
	value = int(num.replace(" ", ""))
	assert value < 1_000_000_000_000_000, value

	if value == 0:
		# None of the branches below emit anything for zero, so without this
		# the caller would replace a literal "0" in the text with nothing.
		return tai.taivuta("nolla", case)

	ans = ""
	# The table is ordered from the largest magnitude down, so each pass
	# peels off one three-digit group.
	for mag, magword in number_table_large_mags.items():
		count = value // mag
		if count > 0:
			# A multiplier of exactly one is left unspoken.
			if count != 1:
				ans += inflect_small_num(count, case)
			# In the plain nominative the magnitude word takes the partitive
			# after a multiplier greater than one; otherwise it follows `case`.
			ans += tai.taivuta(magword, case if case != "nom" else "part" if count > 1 else "nom")
			value -= count * mag

	# Whatever is left is below one thousand.
	ans += inflect_small_num(value, case)
	return ans

def inflect_small_num(num: int, case: str):
	"""Spell out an integer below one thousand as an inflected numeral.

	Args:
		num: The value to spell out.
		case: Case name passed on to taivutin, optionally ending in " mon".

	Returns:
		The hundreds, tens and ones parts concatenated into one string;
		an empty string when `num` is 0.

	Raises:
		AssertionError: If `num` is 1000 or larger.
	"""
	assert num < 1_000, num
	pieces = []
	is_teen = False
	for mag in number_table_small_mags:
		magword = number_table_small_mags[mag]
		count = num // mag
		if count <= 0:
			continue
		if 11 <= num <= 19:
			# 11-19 are formed as <digit> + "toista"; only the digit inflects.
			is_teen = True
			pieces.append(inflect_single_num(num - 10, case) + "toista")
		else:
			# A multiplier of exactly one is left unspoken.
			if count != 1:
				pieces.append(inflect_single_num(count, case))
			# In the plain nominative the magnitude word takes the partitive
			# after a multiplier greater than one; otherwise it follows `case`.
			if case != "nom":
				mag_case = case
			elif count > 1:
				mag_case = "part"
			else:
				mag_case = "nom"
			pieces.append(tai.taivuta(magword, mag_case))
		num -= count * mag

	# The teen branch has already consumed the ones digit.
	if not is_teen:
		pieces.append(inflect_single_num(num, case))
	return "".join(pieces)

def inflect_single_num(num: int, case: str):
	"""Inflect the word for a single digit.

	Args:
		num: Digit value; 0 produces no word.
		case: Case name passed to taivutin, optionally ending in " mon".

	Returns:
		The inflected digit word, or an empty string for 0.

	Raises:
		AssertionError: If `num` is 10 or larger.
	"""
	assert num < 10
	if num == 0:
		return ""

	inflected = tai.taivuta(number_table[num], case)
	# Kludge, because taivutin cannot do this on its own: outside the plain
	# nominative, and only for singular cases or "nom mon", swap the "si" in
	# the inflected form of 5 for "de".
	# NOTE(review): the original condition read `num == 5 or num == 5`; the
	# second comparison may have been meant for another digit (6?) - confirm.
	needs_stem_fix = (
		num == 5
		and case != "nom"
		and (" mon" not in case or case == "nom mon")
	)
	return inflected.replace("si", "de") if needs_stem_fix else inflected

def parse_feats(feats: str | None):
	if not feats:
		return {}
	return {a: b for [a, b] in (f.split("=") for f in feats.split("|"))}

def process_texts(texts: list[str]):
	"""Replace cardinal numbers written as digits with inflected numeral words.

	All texts are run through the stanza pipeline in one call. A word is
	rewritten when its lemma consists only of digits and spaces and its
	feats contain NumType=Card.

	Args:
		texts: The input strings.

	Yields:
		One string per input text, in order: the text with each selected
		number replaced, or the text itself when nothing was replaced.
	"""
	docs = nlp([stanza.Document([], text=text) for text in texts])
	for text, doc in zip(texts, docs):
		substitutions = []
		for sent in doc.sentences:
			for word in sent.words:
				feats = parse_feats(word.feats)
				if not re.fullmatch(r"[\d ]+", word.lemma):
					continue
				if feats.get("NumType", None) != "Card":
					continue

				# Default for numbers that do not modify a following word.
				case = "nom"
				if word.deprel == "nummod" and word.head > word.id:
					# word.head is used as a 1-based position in sent.words.
					head = sent.words[word.head - 1]
					if head.upos != "NOUN":
						# A numeric modifier of a non-noun is left as digits.
						continue
					head_feats = parse_feats(head.feats)
					# The numeral takes the case of the noun it modifies.
					case = case_stanza2taivutin.get(head_feats.get("Case", "Nom"), "nom")
					# A partitive noun only makes the numeral partitive when
					# the numeral itself was tagged as partitive.
					if case == "part" and feats.get("Case", "Nom") != "Par":
						case = "nom"
					if head_feats.get("Number", "Sing") == "Plur":
						case += " mon"

				substitutions.append((word.start_char, word.end_char, inflect(word.lemma, case)))

		if not substitutions:
			yield text
			continue

		# Stitch the untouched stretches of text and the replacements together.
		pieces = []
		cursor = 0
		for start, end, replacement in substitutions:
			pieces.append(text[cursor:start])
			pieces.append(replacement)
			cursor = end
		pieces.append(text[cursor:])
		yield "".join(pieces)

if __name__ == "__main__":
	# Interactive filter: read lines from standard input until input()
	# raises, and print each converted line. input() never returns None, so
	# this loop has no normal exit.
	for line in iter(input, None):
		for converted in process_texts([line]):
			print(converted)
	import taivutin
	import stanza
	import re

	# stanza pipeline for language code "fi"; process_texts() calls it on the
	# documents it builds from the input texts.
	nlp = stanza.Pipeline("fi")
	# Inflector object; the inflect_* functions call its taivuta(word, case)
	# method to put individual numeral words into the requested case.
	tai = taivutin.Taivutin()

	# Translates the value of the `Case` feature found in stanza's
	# morphological feats into the case name handed to tai.taivuta().
	case_stanza2taivutin = dict(
		Nom="nom",
		Gen="gen",
		Par="part",
		Ess="ess",
		Ine="iness",
		Ela="elat",
		Ill="ill",
		Ade="ade",
		Abl="abl",
		All="all",
		Tra="trans",
		Ins="ins",
		Abe="abe",
		Com="kom",
	)

	# Base (nominative) forms of the digits 1-9, keyed by their value.
	number_table = dict(enumerate(
		(
			"yksi",
			"kaksi",
			"kolme",
			"neljä",
			"viisi",
			"kuusi",
			"seitsemän",
			"kahdeksan",
			"yhdeksän",
		),
		start=1,
	))

	# Base forms of 11-19: the digit word followed by "toista".
	# NOTE(review): none of the functions visible in this file read this table.
	number_table_toista = {
		10 + digit: word + "toista" for digit, word in number_table.items()
	}

	# Magnitude words used inside a three-digit group. inflect_small_num()
	# walks this dict in insertion order and subtracts as it goes, so the
	# larger magnitude has to come first.
	number_table_small_mags = {100: "sata", 10: "kymmenen"}

	# Magnitude words for thousands and above. inflect() walks this dict in
	# insertion order and subtracts as it goes, so it must stay sorted from
	# the largest magnitude to the smallest.
	number_table_large_mags = {
		10**12: "biljoona",
		10**9: "miljardi",
		10**6: "miljoona",
		10**3: "tuhat",
	}

	def inflect(num: str, case: str):
		"""Spell out a digit string as a single inflected Finnish numeral word.

		Args:
			num: The number as digits; spaces (e.g. thousands grouping as in
				"12 000") are removed before conversion.
			case: Case name understood by taivutin (e.g. "nom", "gen",
				"part"), optionally followed by " mon", which
				process_texts() appends when the head noun is plural.

		Returns:
			The numeral written as one word, with each component inflected.
			Zero is returned as the inflected form of "nolla".

		Raises:
			ValueError: If `num` is not an integer literal once spaces are
				removed.
			AssertionError: If the value is 10**15 or larger.
		"""
		value = int(num.replace(" ", ""))
		assert value < 1_000_000_000_000_000, value

		if value == 0:
			# None of the branches below emit anything for zero, so without
			# this the caller would replace a literal "0" with nothing.
			return tai.taivuta("nolla", case)

		ans = ""
		# The table is ordered from the largest magnitude down, so each pass
		# peels off one three-digit group.
		for mag, magword in number_table_large_mags.items():
			count = value // mag
			if count > 0:
				# A multiplier of exactly one is left unspoken.
				if count != 1:
					ans += inflect_small_num(count, case)
				# In the plain nominative the magnitude word takes the
				# partitive after a multiplier greater than one; otherwise
				# it follows `case`.
				ans += tai.taivuta(magword, case if case != "nom" else "part" if count > 1 else "nom")
				value -= count * mag

		# Whatever is left is below one thousand.
		ans += inflect_small_num(value, case)
		return ans

	def inflect_small_num(num: int, case: str):
		"""Spell out an integer below one thousand as an inflected numeral.

		Args:
			num: The value to spell out.
			case: Case name passed on to taivutin, optionally ending in " mon".

		Returns:
			The hundreds, tens and ones parts concatenated into one string;
			an empty string when `num` is 0.

		Raises:
			AssertionError: If `num` is 1000 or larger.
		"""
		ans = ""
		toista = False
		assert num < 1_000, num
		for mag, magword in number_table_small_mags.items():
			count = num // mag
			if count > 0:
				if 11 <= num <= 19:
					# 11-19 are formed as <digit> + "toista"; only the digit
					# inflects.
					toista = True
					ans += inflect_single_num(num-10, case) + "toista"
				else:
					# A multiplier of exactly one is left unspoken.
					if count != 1:
						ans += inflect_single_num(count, case)
					# In the plain nominative the magnitude word takes the
					# partitive after a multiplier greater than one.
					ans += tai.taivuta(magword, case if case != "nom" else "part" if count > 1 else "nom")
				num -= count*mag

		# The teen branch has already consumed the ones digit.
		if not toista:
			ans += inflect_single_num(num, case)
		return ans

	def inflect_single_num(num: int, case: str):
		"""Inflect the word for a single digit.

		Args:
			num: Digit value; 0 produces no word.
			case: Case name passed to taivutin, optionally ending in " mon".

		Returns:
			The inflected digit word, or an empty string for 0.

		Raises:
			AssertionError: If `num` is 10 or larger.
		"""
		assert num < 10
		if num == 0:
			return ""

		else:
			ans = tai.taivuta(number_table[num], case)
			# Kludge, because taivutin cannot do this on its own: outside the
			# plain nominative, and only for singular cases or "nom mon",
			# swap the "si" in the inflected form of 5 for "de".
			# NOTE(review): both comparisons test 5; the second may have been
			# meant for another digit (6?) - confirm.
			if (num == 5 or num == 5) and case != "nom" and (" mon" not in case or case == "nom mon"):
				ans = ans.replace("si", "de")

			return ans

	def parse_feats(feats: str \| None):
	if not feats:
	return {}
	return {a: b for [a, b] in (f.split("=") for f in feats.split("\|"))}

	def process_texts(texts: list[str]):
		"""Replace cardinal numbers written as digits with inflected numeral words.

		All texts are run through the stanza pipeline in one call. A word is
		rewritten when its lemma consists only of digits and spaces and its
		feats contain NumType=Card.

		Args:
			texts: The input strings.

		Yields:
			One string per input text, in order: the text with each selected
			number replaced, or the text itself when nothing was replaced.
		"""
		docs = [stanza.Document([], text=text) for text in texts]
		docs = nlp(docs)
		for text, doc in zip(texts, docs):
			substitutions = []
			for sent in doc.sentences:
				for word in sent.words:
					feats = parse_feats(word.feats)
					if re.fullmatch(r"[\d ]+", word.lemma) and feats.get("NumType", None) == "Card":
						if word.deprel == "nummod" and word.head > word.id:
							# word.head is used as a 1-based position in
							# sent.words.
							head = sent.words[word.head - 1]
							# A numeric modifier of a non-noun is left as digits.
							if head.upos == "NOUN":
								head_feats = parse_feats(head.feats)
								# The numeral takes the case of its head noun.
								case = head_feats.get("Case", "Nom")
								case = case_stanza2taivutin.get(case, "nom")
								# A partitive noun only makes the numeral
								# partitive when the numeral itself was
								# tagged as partitive.
								if case == "part" and feats.get("Case", "Nom") != "Par":
									case = "nom"
								if head_feats.get("Number", "Sing") == "Plur":
									case += " mon"

								substitutions.append((word.start_char, word.end_char, inflect(word.lemma, case)))
						else:
							# Numbers that do not modify a following word are
							# spelled out in the nominative.
							case = "nom"
							substitutions.append((word.start_char, word.end_char, inflect(word.lemma, case)))
			if not substitutions:
				yield text

			else:
				# Stitch the untouched stretches and the replacements together.
				ans = ""
				pos = 0
				for start, end, word in substitutions:
					ans += text[pos:start]
					ans += word
					pos = end

				ans += text[pos:]
				yield ans

	if __name__ == "__main__":
		# Interactive filter: read lines from standard input until input()
		# raises, and print each converted line.
		while True:
			text = input()
			for s in process_texts([text]):
				print(s)