Skip to content

Instantly share code, notes, and snippets.

@fkdosilovic
Created October 14, 2022 07:05
Show Gist options
  • Save fkdosilovic/5b89096ef0ce2d0ff3639659ea724070 to your computer and use it in GitHub Desktop.
Save fkdosilovic/5b89096ef0ce2d0ff3639659ea724070 to your computer and use it in GitHub Desktop.
POS Tag informed Lemmatizer
class POSTagLemmatizer:
    """Wrapper around NLTK's WordNetLemmatizer that takes into
    account token's POS tag.

    Each token is tagged with ``nltk.pos_tag`` and the resulting tag is
    translated into the single-letter part-of-speech code that is passed
    to ``WordNetLemmatizer.lemmatize``. Instances are callable; calling an
    instance is the same as calling :meth:`lemmatize`.
    """

    # Single-letter POS code -> POS tags (Penn Treebank style) that map to it.
    ABBR_TO_TAG = {
        "n": ["NN", "NNS", "NNP", "NNPS"],
        "v": ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"],
        "r": ["RB", "RBR", "RBS"],
        "a": ["JJ", "JJR", "JJS"],
    }

    # Inverse lookup built once at class-creation time: POS tag -> POS code.
    TAG_TO_ABBR = {tag: abb for abb, tags in ABBR_TO_TAG.items() for tag in tags}

    def __init__(self):
        """Bind the NLTK tokenizer, POS tagger and WordNet lemmatizer."""
        # Imported here rather than at module level, so nltk is only
        # required once an instance is actually created.
        from nltk import pos_tag
        from nltk import word_tokenize
        from nltk.stem import WordNetLemmatizer

        self._tokenize = word_tokenize
        self._lemmatizer = WordNetLemmatizer()
        self._get_pos_tags = pos_tag

    def lemmatize(self, text):
        """Return the lemmas of ``text`` as a list of strings.

        Args:
            text: Either a raw string, which is tokenized first, or an
                iterable of already-tokenized strings, which is tagged
                as-is without re-tokenizing.

        Returns:
            A list with one lemma per token, in token order.
        """
        # Only raw strings go through the tokenizer; anything else is
        # treated as a ready-made token sequence.
        if isinstance(text, str):
            tokens = self._tokenize(text)
        else:
            tokens = list(text)

        lemmas = []
        for token, tag in self._get_pos_tags(tokens):
            lemmas.append(
                self._lemmatizer.lemmatize(token, self.get_pos_tag_lw(tag))
            )
        return lemmas

    @classmethod
    def get_pos_tag_lw(cls, pos_tag):
        """Map a POS tag to its single-letter POS code.

        Tags that are not listed in ``TAG_TO_ABBR`` fall back to ``"n"``.
        """
        return cls.TAG_TO_ABBR.get(pos_tag, "n")

    def __call__(self, tokens):
        """Lemmatize ``tokens``; shorthand for :meth:`lemmatize`.

        ``tokens`` may be a raw string or an iterable of token strings.
        """
        return self.lemmatize(tokens)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment