Skip to content

Instantly share code, notes, and snippets.

@yashvijay yashvijay/
Last active Aug 1, 2019

What would you like to do?
import nltk

# Fetch the NLTK resources this module relies on into /tmp: the stop-word
# list, the punkt tokenizer models, the perceptron POS tagger and WordNet.
nltk.download('stopwords', download_dir='/tmp')
nltk.download('punkt', download_dir='/tmp')
nltk.download('averaged_perceptron_tagger', download_dir='/tmp')
nltk.download('wordnet', download_dir='/tmp')
# NOTE(review): this call was reconstructed from a garbled line that ended in
# "'/tmp')"; presumably it registered /tmp as an NLTK data directory so the
# downloads above can be found -- confirm against the original gist.
nltk.data.path.append('/tmp')
import re
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
# Single shared lemmatizer instance, reused for every token by lemmatize().
lemmatizer = WordNetLemmatizer()
# English stop words held in a set so clean() can filter tokens by membership.
stopWords = set(stopwords.words('english'))
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields the empty string.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # No WordNet category corresponds to this tag.
    return ''
def lemmatize(word):
    """Return the WordNet lemma of a single token.

    A trailing '.' is dropped from the token before lemmatizing. The POS tag
    is still computed on the token exactly as received (including any
    trailing '.'), then mapped through ``get_wordnet_pos``.

    If a ``KeyError`` or ``IndexError`` is raised along the way, the token is
    returned unchanged. An empty token, for instance, makes ``word[-1]``
    raise ``IndexError``.
    """
    try:
        # Checked first so an empty token fails here, before any tagging.
        stem = word[:-1] if word[-1] == '.' else word
        tag = nltk.pos_tag([word])[0][1]
        # NOTE(review): the KeyError handler presumably covers the lemmatizer
        # rejecting the '' POS that get_wordnet_pos returns for unmapped
        # tags -- confirm against the installed NLTK version.
        return lemmatizer.lemmatize(stem, get_wordnet_pos(tag))
    except (KeyError, IndexError):
        return word
def clean(X):
    """Normalize every item of ``X`` into a cleaned, lemmatized string.

    Each item is converted with ``str()``, lower-cased and tokenized with
    ``nltk.word_tokenize``. Tokens found in ``stopWords`` are dropped and the
    rest are passed through ``lemmatize``. The lemmas are joined with single
    spaces, every character in ``string.punctuation`` is replaced by a space,
    runs of whitespace are collapsed to one space, and the result is stripped.

    Returns a list with one cleaned string per input item, in input order.
    """
    # Loop-invariant: maps each punctuation character to a space.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    result = []
    for data in X:
        tokens = nltk.word_tokenize(str(data).lower())
        lemmas = [lemmatize(token) for token in tokens if token not in stopWords]
        text = ' '.join(lemmas).translate(punctuation_to_space)
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        result.append(re.sub(r'\s+', ' ', text).strip())
    return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.