@osule
Last active May 17, 2022 06:16
Lemmatizing words in a sentence based on part of speech. The snippet tags each token with NLTK's universal POS tagset, maps each tag onto the single-letter pos code that WordNetLemmatizer.lemmatize expects, and applies the lemmatizer to every abstract in the DataFrame.
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Download the corpora and models needed for tokenization, tagging, and lemmatization.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

df = pd.DataFrame({
    'Abstract': ['The research findings are factual.', '100% of us are gonna die.']
})

lemmatizer = nltk.WordNetLemmatizer()

# Map universal POS tags to the pos values supported by WordNetLemmatizer.lemmatize;
# anything unmapped (punctuation, numerals, etc.) falls back to 'n' (noun).
pos_mappings = defaultdict(lambda: 'n')
pos_mappings.update({
    'NOUN': 'n',
    'VERB': 'v',
    'ADJ': 'a',
    'ADV': 'r',
    'ADJ_SAT': 's',
})

def lemmatize(sentence):
    # Tokenize, tag with the universal tagset, then lemmatize each token
    # using the WordNet pos code mapped from its tag.
    tagged_words = nltk.pos_tag(word_tokenize(sentence), tagset='universal')
    return [lemmatizer.lemmatize(word, pos=pos_mappings[pos_tag]) for word, pos_tag in tagged_words]

df['Abstract'] = df['Abstract'].apply(lemmatize)
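
As a quick sanity check (a minimal sketch; the exact tokens depend on the NLTK tokenizer and tagger models downloaded above), one can print the lemmatized token lists:

# Inspect the lemmatized abstracts; e.g. "findings" should reduce to "finding"
# and "are" to "be" once tagged as a VERB.
for tokens in df['Abstract']:
    print(tokens)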