Skip to content

Instantly share code, notes, and snippets.

@yashvijay yashvijay/preprocess.py
Last active Aug 1, 2019

Embed
What would you like to do?
import nltk
# Fetch the NLTK resources into /tmp at import time. Note these calls run
# every time the module is imported.
nltk.download('stopwords', download_dir='/tmp')
nltk.download('punkt', download_dir='/tmp')
nltk.download('averaged_perceptron_tagger', download_dir='/tmp')
nltk.download('wordnet', download_dir='/tmp')
# Add /tmp to NLTK's search path so the resources downloaded above are found.
# This must happen before the stop-word list is loaded below.
nltk.data.path.append('/tmp')
import re
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
# Shared lemmatizer instance used by lemmatize().
lemmatizer = WordNetLemmatizer()
# English stop words as a set; clean() tests token membership against it.
stopWords = set(stopwords.words('english'))
def get_wordnet_pos(treebank_tag):
    """Map a POS tag to a WordNet part-of-speech constant by its prefix.

    Tags starting with 'J', 'V', 'N' and 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag yields the empty string.
    """
    # Attribute names are resolved lazily so ``wordnet`` is only touched
    # when a prefix actually matches.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, attr)
    return ''
def lemmatize(word):
    """Return the WordNet lemma of a single token.

    A single trailing '.' is dropped from the text handed to the lemmatizer,
    while the part of speech is taken from tagging the original token on its
    own. If a ``KeyError`` or ``IndexError`` is raised along the way (an empty
    token raises ``IndexError`` on the last-character check), the token is
    returned unchanged, trailing '.' included.
    """
    try:
        stem = word[:-1] if word[-1] == '.' else word
        treebank_tag = nltk.pos_tag([word])[0][1]
        # NOTE(review): presumably the '' POS returned for unmapped tags is
        # what triggers the KeyError handled below -- confirm against NLTK.
        return lemmatizer.lemmatize(stem, get_wordnet_pos(treebank_tag))
    except (KeyError, IndexError):
        return word
def clean(X):
    """Normalize every item of ``X`` into a cleaned, lemmatized string.

    Each item is converted with ``str()``, lower-cased and tokenized with
    ``nltk.word_tokenize``. Tokens found in ``stopWords`` are dropped and the
    rest are passed through ``lemmatize``. The tokens are then joined with
    single spaces, every ``string.punctuation`` character is replaced by a
    space, runs of whitespace are collapsed to one space, and the ends are
    stripped.

    Args:
        X: Iterable of items to clean.

    Returns:
        A list of cleaned strings, one per item of ``X``, in the same order.
    """
    # Loop-invariant helpers, built once per call instead of once per item.
    punct_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    whitespace_run = re.compile(r'\s+')

    result = []
    for data in X:
        tokens = nltk.word_tokenize(str(data).lower())
        lemmas = [lemmatize(token) for token in tokens if token not in stopWords]
        text = ' '.join(lemmas).translate(punct_to_space)
        result.append(whitespace_run.sub(' ', text).strip())
    return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.