@danemacaulay
Created November 6, 2017 20:52
scikit-learn NLTK preprocessor
import string
from numpy import array
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.base import BaseEstimator, TransformerMixin
class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.stopwords = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
        self.excluded_pos_tags = ["FW", "CD", "SYM"]

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            ' '.join(list(self.tokenize(doc))) for doc in X
        ]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part-of-speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # Remove certain POS tags; this didn't help much
                if tag in self.excluded_pos_tags:
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        # Map the Penn Treebank tag prefix to a WordNet POS, defaulting to noun
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)
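
Note: tokenize relies on a few NLTK resources that are not installed by default. A minimal one-time setup sketch, using the standard NLTK downloader names for sentence splitting, POS tagging, the stopwords corpus and WordNet:

import nltk

# One-time downloads for sent_tokenize, pos_tag, the stopwords list and the WordNet lemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')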
danemacaulay commented Nov 6, 2017

usage example:

Pipeline([
    ('preprocessor', NLTKPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SomeClassifier()),
])
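
For a runnable end-to-end sketch, the snippet below fills in the missing imports and substitutes LogisticRegression plus a tiny toy corpus for SomeClassifier and real data (both are stand-ins, not part of the original gist):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Tiny illustrative corpus and labels (hypothetical data)
docs = ["The cats are sleeping on the mat.", "Dogs were barking loudly outside."]
labels = [0, 1]

model = Pipeline([
    ('preprocessor', NLTKPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression()),
])
model.fit(docs, labels)
print(model.predict(["A dog barked at the cat."]))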
