Skip to content

Instantly share code, notes, and snippets.

@anezih
Created February 20, 2023 18:48
Show Gist options
  • Save anezih/bcf883445c36a1c72202176396c7e020 to your computer and use it in GitHub Desktop.
Save anezih/bcf883445c36a1c72202176396c7e020 to your computer and use it in GitHub Desktop.
zemberek-python ile sözcük üretme (Word Generation)
import pprint
from itertools import product

from zemberek.morphology import TurkishMorphology
from zemberek.morphology.lexicon import RootLexicon
from zemberek.morphology.morphotactics.turkish_morphotactics import get_morpheme_map
# Create one TurkishMorphology instance with default settings; the demo calls
# at the bottom of the file pass it to generate_noun and generate_verb.
MORPHOLOGY = TurkishMorphology.create_with_defaults()
def generate_noun(word, morphology):
    """Generate inflected surface forms for every nominal lexicon entry of *word*.

    The default root lexicon is probed with ids of the form
    ``f"{word}_{suffix}"`` for each part-of-speech suffix listed in the body
    (see resources/lexicon.csv).  For every entry that exists, each
    combination of number, possessive and case morphemes is handed to
    ``morphology.word_generator.generate``: first on its own, then followed
    by the derivational morpheme ``Rel`` (masa-m-da-`ki`, dün-`kü`).

    Background:
      https://docs.google.com/document/d/1bUW6i5KrjkeID8GD7rs8tbGWapZooLmt2gmdAUvPHW8
      https://github.com/ahmetaa/zemberek-nlp/tree/master/morphology#word-generation
      https://github.com/ahmetaa/zemberek-nlp/blob/master/examples/src/main/java/zemberek/examples/morphology/GenerateWords.java

    Args:
        word: Root word used to build the lexicon ids, e.g. ``"bilinç"``.
        morphology: Object exposing ``word_generator.generate(item, morphemes)``;
            the demo at the bottom of the file passes a ``TurkishMorphology``.

    Returns:
        A dict mapping each lexicon id that was found to the list of
        ``surface`` values of the generated results, in generation order.
        The dict is empty when no lexicon id matches.
    """
    morphemes = get_morpheme_map()
    numbers = [
        morphemes.get(key)
        for key in ("A1sg", "A2sg", "A3sg", "A1pl", "A2pl", "A3pl")
    ]
    possessives = [
        morphemes.get(key)
        for key in ("P1sg", "P2sg", "P3sg", "P1pl", "P2pl", "P3pl", "Pnon")
    ]
    cases = [
        morphemes.get(key)
        for key in ("Dat", "Loc", "Abl", "Gen", "Acc", "Ins", "Nom")
    ]
    derivational = [morphemes.get("Rel")]  # masa-m-da-`ki`, dün-`kü`

    # The morpheme combinations do not depend on the lexicon entry, so build
    # them once.  All plain (number, possessive, case) chains come first,
    # followed by the same chains extended with a derivational morpheme; this
    # keeps the generated forms in the same order as two separate passes.
    suffix_chains = [
        *product(numbers, possessives, cases),
        *product(numbers, possessives, cases, derivational),
    ]

    # See resources/lexicon.csv for the id suffixes used by the lexicon.
    lexicon_suffixes = (
        "Noun", "Noun_1", "Noun_2", "Noun_Time", "Noun_Time_1",
        "Adj_Prop", "Adj", "Adv_Prop", "Adv_Time", "Pron", "Pron_Demons",
        "Pron_Demons_1", "Pron_Pers", "Pron_Quant", "Pron_Ques", "Pron_Reflex",
    )
    lexicon = RootLexicon.get_default()

    forms_by_id = {}
    for suffix in lexicon_suffixes:
        item_id = f"{word}_{suffix}"
        item = lexicon.get_item_by_id(item_id)
        if not item:
            # No lexicon entry under this id; try the next suffix.
            continue
        forms_by_id[item_id] = [
            result.surface
            for chain in suffix_chains
            for result in morphology.word_generator.generate(item, chain)
        ]
    return forms_by_id
def generate_verb(verb, morphology):
    """Generate conjugated surface forms for the lexicon entry ``f"{verb}_Verb"``.

    Every combination of polarity, tense/mood and person morphemes is handed
    to ``morphology.word_generator.generate`` and the ``surface`` value of
    each generated result is collected.

    Args:
        verb: Verb as it appears in the lexicon id, e.g. ``"savaşmak"``.
        morphology: Object exposing ``word_generator.generate(item, morphemes)``;
            the demo at the bottom of the file passes a ``TurkishMorphology``.

    Returns:
        A list with the ``surface`` value of every generated result, in
        generation order, or an empty list when the default root lexicon has
        no entry for ``f"{verb}_Verb"``.
    """
    morphemes = get_morpheme_map()
    # "Verb" is used in place of an explicit positive-polarity morpheme.
    polarities = [morphemes.get(key) for key in ("Verb", "Neg")]
    # NOTE (from the original author): "Cond" sometimes does not work.
    tenses = [
        morphemes.get(key)
        for key in ("Imp", "Aor", "Past", "Prog1", "Prog2",
                    "Narr", "Fut", "Neces", "Opt", "Cond")
    ]
    persons = [
        morphemes.get(key)
        for key in ("A1sg", "A2sg", "A3sg", "A1pl", "A2pl", "A3pl")
    ]

    lexicon_suffix = "Verb"
    item = RootLexicon.get_default().get_item_by_id(f"{verb}_{lexicon_suffix}")
    if not item:
        return []

    # product() iterates with the rightmost list varying fastest, i.e. in the
    # same order as nested polarity -> tense -> person loops.
    return [
        result.surface
        for chain in product(polarities, tenses, persons)
        for result in morphology.word_generator.generate(item, chain)
    ]
def noun_verb_other(word, morphology):
    """Analyze *word* and return an ``(ending, stem)`` pair per analysis.

    Args:
        word: The word passed to ``morphology.analyze``.
        morphology: Object exposing ``analyze(word)`` whose result can be
            iterated; each element must provide ``get_ending()`` and
            ``get_stem()``.

    Returns:
        A list of ``(ending, stem)`` tuples, one per analysis, in the order
        the analyses are yielded.
    """
    pairs = []
    for single_analysis in morphology.analyze(word):
        ending = single_analysis.get_ending()
        stem = single_analysis.get_stem()
        pairs.append((ending, stem))
    return pairs
# Demo: generate and pretty-print the forms of one noun and one verb using the
# shared MORPHOLOGY instance.
pp = pprint.PrettyPrinter(indent=2)
# generate_noun returns a dict keyed by lexicon id.
res = generate_noun("bilinç", MORPHOLOGY)
pp.pprint(res)
# generate_verb returns a flat list of surface forms (empty if the verb is
# not in the lexicon).
res_verb = generate_verb("savaşmak", MORPHOLOGY)
pp.pprint(res_verb)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment