scikit-learn NLTK preprocessor

A scikit-learn transformer that uses NLTK to sentence-split, tokenize, POS-tag, filter, and lemmatize raw text before vectorization.
import string

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.base import BaseEstimator, TransformerMixin


class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.stopwords = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
        # Foreign words, cardinal numbers, symbols
        self.excluded_pos_tags = ["FW", "CD", "SYM"]

    def fit(self, X, y=None):
        # Stateless transformer: nothing to fit
        return self

    def inverse_transform(self, X):
        # Assumes X is an iterable of token lists
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        # Rejoin tokens into one string per document so the output can
        # feed directly into a CountVectorizer/TfidfVectorizer
        return [' '.join(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part-of-speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                if self.strip:
                    token = token.strip().strip('_').strip('*')
                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue
                # Drop certain POS tags (in practice this didn't help much)
                if tag in self.excluded_pos_tags:
                    continue
                # If the token is pure punctuation, ignore it and continue
                if all(char in self.punct for char in token):
                    continue
                # Lemmatize the token and yield
                yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        # Map the leading character of the Penn Treebank tag to a
        # WordNet POS constant, defaulting to noun
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)
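
The class relies on several NLTK corpora and models being available locally. A minimal one-time setup sketch, using NLTK's standard download identifiers for the era of this gist (newer NLTK releases may additionally want 'punkt_tab' and 'averaged_perceptron_tagger_eng'):

import nltk

nltk.download('punkt')                       # sent_tokenize
nltk.download('stopwords')                   # sw.words('english')
nltk.download('wordnet')                     # WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')  # pos_tag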
Usage example:
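
A minimal sketch of wiring the preprocessor into a scikit-learn Pipeline ahead of a vectorizer; the toy documents, labels, and the TfidfVectorizer/SGDClassifier choices below are illustrative assumptions, not part of the original gist:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Hypothetical toy corpus and binary labels for illustration
docs = ["The cats were running quickly.", "A dog barked at the mailman."]
labels = [0, 1]

model = Pipeline([
    ('preprocessor', NLTKPreprocessor()),
    # transform() emits space-joined token strings, so a standard
    # vectorizer can consume the output directly
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SGDClassifier()),
])

model.fit(docs, labels)
print(model.predict(["The dog runs."]))

Because the heavy NLTK work happens in the preprocessor, the vectorizer can be left at its defaults; any other scikit-learn estimator can be swapped in at the end of the pipeline in the same way.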