Skip to content

Instantly share code, notes, and snippets.

@hendra-herviawan
Last active January 3, 2018 10:16
Show Gist options
  • Save hendra-herviawan/922cbefab4af5623cf1ace1c79ac7bcc to your computer and use it in GitHub Desktop.
# https://bbengfort.github.io/tutorials/2016/05/19/text-classification-nltk-sckit-learn.html
import string
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.base import BaseEstimator, TransformerMixin
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """Sklearn-compatible transformer that tokenizes and lemmatizes raw text.

    Each document is split into sentences, POS-tagged tokens are lowercased
    and stripped (optionally), stopwords and pure-punctuation tokens are
    dropped, and the survivors are WordNet-lemmatized using their POS tag.

    Parameters
    ----------
    stopwords : set or None
        Tokens to discard. Defaults to NLTK's English stopword list.
        NOTE: because ``or`` is used, an explicitly passed empty set also
        falls back to the default; pass None to get the default.
    punct : set or None
        Characters considered punctuation. Defaults to ``string.punctuation``.
    lower : bool
        Lowercase tokens before filtering.
    strip : bool
        Strip whitespace and leading/trailing '_' and '*' from tokens.
    """

    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.stopwords = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def inverse_transform(self, X):
        # Rejoin token lists into whitespace-separated strings.
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        # Materialize the token generator for each document.
        return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        """Yield cleaned, lemmatized tokens from *document*."""
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part-of-speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                if self.lower:
                    token = token.lower()
                if self.strip:
                    # Strip whitespace, then leading/trailing '_' and '*'
                    token = token.strip().strip('_').strip('*')
                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue
                # If punctuation, ignore token and continue
                # (vacuously true for tokens stripped down to "")
                if all(char in self.punct for char in token):
                    continue
                # Lemmatize the token and yield
                yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        """Lemmatize *token* using the first letter of its Penn Treebank *tag*.

        Unknown tag prefixes default to noun, matching WordNet's behavior.
        """
        wn_tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, wn_tag)
#https://github.com/ChenglongChen/Kaggle_HomeDepot/blob/55c1033d0af3b6cf2f033fe4bcf3e1e0ffda3445/Code/Chenglong/data_processor.py
import nltk
class Lemmatizer:
    """Lemmatize text with NLTK's WordNet lemmatizer over Treebank tokens.

    Note: no POS tag is supplied, so WordNet treats every token as a noun.
    """

    def __init__(self):
        self.Tokenizer = nltk.tokenize.TreebankWordTokenizer()
        self.Lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    def transform(self, text):
        """Return *text* with each Treebank token replaced by its lemma."""
        lemmas = [self.Lemmatizer.lemmatize(token)
                  for token in self.Tokenizer.tokenize(text)]
        return " ".join(lemmas)
## stemming
class Stemmer:
    """Stem whitespace-separated text with a Porter or Snowball stemmer.

    Parameters
    ----------
    stemmer_type : str
        Either "porter" or "snowball" (default). Any other value raises
        ``ValueError``.
    """

    def __init__(self, stemmer_type="snowball"):
        self.stemmer_type = stemmer_type
        if self.stemmer_type == "porter":
            self.stemmer = nltk.stem.PorterStemmer()
        elif self.stemmer_type == "snowball":
            self.stemmer = nltk.stem.SnowballStemmer("english")
        else:
            # Fail fast: the original left self.stemmer unset for unknown
            # types, deferring failure to an opaque AttributeError in
            # transform().
            raise ValueError(
                "stemmer_type must be 'porter' or 'snowball', got %r"
                % (stemmer_type,))

    def transform(self, text):
        """Return *text* with each space-separated token stemmed."""
        return " ".join(self.stemmer.stem(token) for token in text.split(" "))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment