Skip to content

Instantly share code, notes, and snippets.

@hugoncosta
Last active December 1, 2023 16:41
Show Gist options
  • Save hugoncosta/2fa5684c43724584eb0ae8657d28f80a to your computer and use it in GitHub Desktop.
Save hugoncosta/2fa5684c43724584eb0ae8657d28f80a to your computer and use it in GitHub Desktop.
import nltk
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# Notebook setup: fetch the Portuguese lemmatization list and the NLTK data
# that the objects below rely on.
# NOTE: the `!wget` line is an IPython/Jupyter shell escape, not Python syntax,
# so this cell only runs inside a notebook. `-q` silences wget's output; the
# file is saved as lemmatization-pt.txt in the current working directory.
!wget https://github.com/michmech/lemmatization-lists/raw/master/lemmatization-pt.txt -q
# NLTK resources backing stopwords.words(...) and RSLPStemmer respectively.
nltk.download('stopwords')
nltk.download('rslp')
# Stopword lists converted to sets for fast membership tests.
stopwordspt = set(stopwords.words("portuguese"))
stopwordsen = set(stopwords.words("english"))
# Portuguese stemmer; PortugueseMess falls back to it when a word has no
# entry in the lemma table.
ptstemmer = RSLPStemmer()
# NOTE(review): despite the name, this is a WordNet lemmatizer rather than a
# stemmer. The 'wordnet' corpus is not downloaded in this cell; presumably it
# must be available before enstemmer.lemmatize() is called - confirm.
enstemmer = WordNetLemmatizer()
# Build the inflected-form -> lemma lookup table from the downloaded list.
# Each usable line holds at least two whitespace-separated fields: the lemma
# first, then the inflected form that maps to it.
lmztpt = {}
# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark when one is
# present, so the mark cannot end up glued to the first lemma, and the result
# no longer depends on the platform's default encoding. The `with` block
# guarantees the file handle is closed once the table is built.
with open("lemmatization-pt.txt", encoding="utf-8-sig") as dic:
    for line in dic:
        txt = line.split()
        if len(txt) < 2:
            # Blank or malformed line: there is no pair to record, so skip it
            # instead of failing with IndexError.
            continue
        lmztpt[txt[1]] = txt[0]
# Lemmatize wherever possible
def PortugueseMess(word: str) -> str:
    """Return the lemma of *word*, or its RSLP stem when no lemma is known.

    Args:
        word: The token to normalise. It is looked up exactly as given; no
            case folding or stripping is applied here.

    Returns:
        The value stored for *word* in the module-level ``lmztpt`` table when
        there is one, otherwise the result of ``ptstemmer.stem(word)``.
    """
    # One dictionary lookup instead of a membership test followed by a second
    # fetch. The table's values are strings produced by str.split(), so None
    # can only mean "no entry".
    lemma = lmztpt.get(word)
    if lemma is not None:
        return lemma
    return ptstemmer.stem(word)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment