@oneryalcin
Last active August 6, 2019 12:09
sklearn's FeatureUnion in action: a custom StartingVerbExtractor transformer runs in parallel with the standard text-processing pipeline, and their outputs are combined into a single feature matrix for the classifier.
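For orientation (a minimal sketch, not part of the gist): FeatureUnion fits several transformers on the same input and stacks their outputs side by side, so the downstream estimator sees one combined feature matrix. StandardScaler and PCA here are only stand-ins for the two branches used below.

import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

X = np.random.rand(10, 4)              # toy data: 10 samples, 4 features
union = FeatureUnion([
    ('scaled', StandardScaler()),      # contributes 4 columns
    ('pca', PCA(n_components=2)),      # contributes 2 columns
])
X_combined = union.fit_transform(X)
print(X_combined.shape)                # (10, 6): branch outputs concatenated column-wise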
# custom_transformer.py -- tokenizer and custom transformer imported by the main script below
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'  # raw string avoids invalid-escape warnings
def tokenize(text):
    """Replace URLs, tokenize and lemmatize the text, and return clean lowercase tokens."""
    # Replace every URL with a placeholder so each becomes a single, uniform token
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    # Lemmatize, lowercase and strip each token
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Custom sklearn transformer: flags documents whose first sentence starts with a verb (or 'RT')."""

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            if not pos_tags:
                # Guard against sentences that tokenize to nothing
                continue
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, x, y=None):
        # Nothing to learn; the transformer is stateless
        return self

    def transform(self, X):
        # Produce one boolean column: does the document start with a verb?
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)
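A quick way to sanity-check the transformer on its own (a minimal sketch, not part of the gist; the example sentences are made up):

from custom_transformer import StartingVerbExtractor

docs = ["Run the report before lunch.", "The report was run before lunch."]
sve = StartingVerbExtractor()
flags = sve.fit_transform(docs)
print(flags)  # one-column DataFrame of booleans, one row per document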
# Main script: build the FeatureUnion pipeline, train it, and report results.
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from custom_transformer import StartingVerbExtractor, tokenize
def load_data():
    """Load the corporate messaging dataset and return texts and category labels."""
    url = 'https://d1p17r2m4rzlbo.cloudfront.net/wp-content/uploads/2016/03/Corporate-messaging-DFE.csv'
    df = pd.read_csv(url, encoding='latin-1')

    # Keep only confidently labelled rows and drop the 'Exclude' category
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]

    X = df.text.values
    y = df.category.values
    return X, y
def model_pipeline():
    """Text features and the starting-verb flag run in parallel inside a FeatureUnion."""
    pipeline = Pipeline([
        ('features', FeatureUnion([

            # Branch 1: bag-of-words counts followed by TF-IDF weighting
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            # Branch 2: custom starting-verb feature
            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', RandomForestClassifier())
    ])
    return pipeline
def display_results(y_test, y_pred):
    """Print labels, confusion matrix and accuracy for the test set."""
    labels = np.unique(y_pred)  # note: labels are taken from the predictions
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
def main():
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = model_pipeline()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    display_results(y_test, y_pred)


if __name__ == '__main__':
    main()
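Once the pipeline is fitted, it can be useful to look at the combined matrix the FeatureUnion hands to the classifier (a hedged sketch built on the functions above; the exact column count depends on the fitted vocabulary):

from sklearn.model_selection import train_test_split

X, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(X, y)

model = model_pipeline()
model.fit(X_train, y_train)

# The 'features' step is the FeatureUnion; its output is what the classifier sees:
# all TF-IDF columns plus one extra column for the starting-verb flag.
union = model.named_steps['features']
combined = union.transform(X_test[:5])
print(combined.shape)  # (5, vocabulary_size + 1)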