Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script to quickly compare the output of seven different stemmers and lemmatizers, focused on the German language
import json
import re
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
import spacy
from HanTa import HanoverTagger as ht
from nltk.stem import WordNetLemmatizer
from spacy_iwnlp import spaCyIWNLP
# Load the list of words to compare. The context manager closes the file as
# soon as the JSON has been parsed; UTF-8 is requested explicitly so German
# umlauts do not depend on the platform's default encoding.
with open('total_words.json', 'r', encoding='utf-8') as words_file:
    words = json.load(words_file)

# One instance of every stemmer / lemmatizer under comparison.
snowball = SnowballStemmer("german", ignore_stopwords=True)
porter = PorterStemmer()
lancaster = LancasterStemmer()
hannover = ht.HanoverTagger('morphmodel_ger.pgz')
wordnet = WordNetLemmatizer()
spc = spacy.load(r'/usr/local/lib/python3.9/site-packages/de_core_news_md/de_core_news_md-2.3.0', disable=['parser', 'ner'])

# NOTE: `iwnlp` is the very same pipeline object as `spc`; adding the IWNLP
# component therefore extends `spc` as well. The name is kept for
# compatibility, but one pipeline call per word is enough to obtain both the
# spaCy lemma and the IWNLP lemmas.
iwnlp = spc
iwnlp_pipe = spaCyIWNLP(lemmatizer_path='/Users/user1/Downloads/IWNLP.Lemmatizer_20181001.json')
iwnlp.add_pipe(iwnlp_pipe)

# Characters replaced by a blank before stemming. Raw string so the
# backslashes reach the regex engine without invalid-escape warnings.
clean_text_expression = r'[\(\):\[\]\{\}\r\n\/\<\>\%\&\;\_]'

# Eight left-aligned columns, 20 characters wide each.
row_format = "{0:20}{1:20}{2:20}{3:20}{4:20}{5:20}{6:20}{7:20}"

print(row_format.format("Wort", "Porter", "Lancaster", "Snowball", "Wordnet", "Hannover", "Spacy", "IWNLP"))
for word in words:
    cleaned_word = re.sub(clean_text_expression, ' ', word)

    # Run the (shared) spaCy + IWNLP pipeline once and look at the first
    # token only; an empty document yields empty cells instead of IndexError.
    doc = spc(cleaned_word)
    first_token = doc[0] if len(doc) > 0 else None
    if first_token is None:
        spacy_lemma = ''
        iwnlp_lemma = ''
    else:
        spacy_lemma = str(first_token.lemma_)
        # The IWNLP component stores its result in the custom extension
        # attribute, not in `lemma_` (presumably a list of candidate lemmas
        # or None when the word is unknown -- see the spacy-iwnlp README).
        iwnlp_lemma = str(first_token._.iwnlp_lemmas)

    print(row_format.format(
        word,  # was the undefined name `each_word`, which raised NameError
        porter.stem(cleaned_word),
        lancaster.stem(cleaned_word),
        snowball.stem(cleaned_word),
        wordnet.lemmatize(cleaned_word),
        str(hannover.analyze(cleaned_word)[0]),
        spacy_lemma,
        iwnlp_lemma,
    ))
@vicru

This comment has been minimized.

Copy link

@vicru vicru commented Jun 9, 2021

cannot be opened neither directly on github web, github desktop nor on conda =(
Yields an error about a wrong json file

@nickyreinert

This comment has been minimized.

Copy link
Owner Author

@nickyreinert nickyreinert commented Jun 9, 2021

cannot be opened neither directly on github web, github desktop nor on conda =(
Yields an error about a wrong json file

Try the button to display the sourcecode... The notebook itself doesn't work for me in the browser either.

@vicru

This comment has been minimized.

Copy link

@vicru vicru commented Jun 9, 2021

cannot be opened neither directly on github web, github desktop nor on conda =(
Yields an error about a wrong json file

Try the button to display the sourcecode... The notebook itself doesn't work for me in the browser either.

thx, worked out!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment