Last active
August 6, 2019 12:09
-
-
Save oneryalcin/564a93ed9bf0adfe2fe4dca701508875 to your computer and use it in GitHub Desktop.
sklearn's FeatureUnion in action. We add a custom-defined StartingVerbExtractor transformer and run it in parallel with the text pipeline inside a FeatureUnion.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger']) | |
import re | |
import pandas as pd | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
from sklearn.base import BaseEstimator, TransformerMixin | |
# Matches http/https URLs so they can be replaced before tokenization.
# Raw string avoids invalid-escape warnings for \( and \) in modern Python.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
def tokenize(text):
    """Normalize a raw message into clean tokens.

    URLs are first replaced with the literal placeholder "urlplaceholder"
    so they don't shatter into junk tokens, then the text is word-tokenized
    and each token is lemmatized, lowercased and stripped.

    Args:
        text (str): Raw message text.

    Returns:
        list[str]: Cleaned tokens.
    """
    # Neutralize every detected URL before tokenizing.
    for url in re.findall(url_regex, text):
        text = text.replace(url, "urlplaceholder")

    lemmatizer = WordNetLemmatizer()
    # Comprehension replaces the manual append loop (same output order).
    return [lemmatizer.lemmatize(tok).lower().strip()
            for tok in word_tokenize(text)]
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Custom sklearn transformer for the feature-union example.

    Produces one boolean feature per document: True when any sentence in
    the document opens with a verb (POS tag VB/VBP) or the retweet marker
    'RT'.
    """

    def starting_verb(self, text):
        """Return True if any sentence in *text* starts with a verb or 'RT'."""
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))
            # Bug fix: a sentence can tokenize to nothing (e.g. it is only
            # punctuation), in which case pos_tags is empty and indexing
            # pos_tags[0] would raise IndexError.
            if not pos_tags:
                continue
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, x, y=None):
        """No-op fit — the transformer is stateless."""
        return self

    def transform(self, X):
        """Map each document in X to a single-column boolean DataFrame."""
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger']) | |
import re | |
import numpy as np | |
import pandas as pd | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
from sklearn.pipeline import Pipeline, FeatureUnion | |
from sklearn.metrics import confusion_matrix | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.model_selection import train_test_split | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | |
from custom_transformer import StartingVerbExtractor, tokenize | |
def load_data():
    """Fetch the corporate-messaging CSV and return feature/label arrays.

    Only rows annotated with full confidence are kept, and the 'Exclude'
    category is dropped.

    Returns:
        tuple: (X, y) — arrays of message texts and category labels.
    """
    url = 'https://d1p17r2m4rzlbo.cloudfront.net/wp-content/uploads/2016/03/Corporate-messaging-DFE.csv'
    df = pd.read_csv(url, encoding='latin-1')

    # Named masks instead of one inline boolean expression.
    fully_confident = df["category:confidence"] == 1
    not_excluded = df['category'] != 'Exclude'
    df = df[fully_confident & not_excluded]

    return df.text.values, df.category.values
def model_pipeline():
    """Build the classification pipeline.

    A FeatureUnion runs two branches in parallel — a bag-of-words/tf-idf
    text pipeline and the custom StartingVerbExtractor — whose outputs
    feed a random forest classifier.

    Returns:
        Pipeline: unfitted sklearn pipeline.
    """
    # Branch 1: token counts followed by tf-idf weighting.
    text_pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])

    # Both branches computed in parallel and concatenated column-wise.
    features = FeatureUnion([
        ('text_pipeline', text_pipeline),
        ('starting_verb', StartingVerbExtractor()),
    ])

    return Pipeline([
        ('features', features),
        ('clf', RandomForestClassifier()),
    ])
def display_results(y_test, y_pred):
    """Print labels, confusion matrix and accuracy for a prediction run.

    Args:
        y_test: Ground-truth labels (array-like).
        y_pred: Predicted labels (array-like).
    """
    # Bug fix: derive the label set from BOTH arrays. np.unique(y_pred)
    # alone silently drops any class the model never predicted, producing
    # a confusion matrix that omits rows/columns for those classes.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
def main():
    """End-to-end run: load data, train the pipeline, report test metrics."""
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = model_pipeline()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    display_results(y_test, y_pred)


# Guard the entry point so importing this module does not trigger a full
# download-and-train run as a side effect.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment