@oneryalcin
Last active August 6, 2019 13:19
Sklearn's GridSearchCV with Pipelines
import re

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion

nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

# Raw string avoids invalid escape-sequence warnings in Python 3
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
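# Rough illustration (not part of the original gist): the pattern matches full
# URLs embedded in free text, e.g.
#   re.findall(url_regex, 'RT @user see https://t.co/abc123 now')
#   -> ['https://t.co/abc123']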
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """Binary feature: does the text open with a verb (or a retweet marker)?"""

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            if not pos_tags:  # guard against sentences that tokenize to nothing
                continue
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, x, y=None):
        # Stateless transformer: nothing to learn during fit
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)
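# Quick sanity check (illustrative, not from the original gist):
#   StartingVerbExtractor().transform(['Send us your feedback', 'The report is out'])
# should yield a one-column boolean DataFrame: True for the first text
# ('send' is tagged VB at the start of the sentence), False for the second.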
def load_data():
    url = 'https://d1p17r2m4rzlbo.cloudfront.net/wp-content/uploads/2016/03/Corporate-messaging-DFE.csv'
    df = pd.read_csv(url, encoding='latin-1')
    # Keep only rows labelled with full annotator confidence, and drop the
    # 'Exclude' category
    df = df[(df['category:confidence'] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y
def tokenize(text):
    # Replace URLs with a placeholder token so they don't pollute the vocabulary
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens
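# Roughly, tokenize('New reports at https://t.co/abc123') would come back as
# something like ['new', 'report', 'at', 'urlplaceholder'] -- lemmatized,
# lowercased tokens with the URL swapped out (illustrative, not from the gist).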
def build_model():
    pipeline = Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', RandomForestClassifier())
    ])

    # Grid keys address nested pipeline steps via double underscores:
    # <step>__<substep>__<param>
    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
        'features__text_pipeline__vect__max_features': (None, 5000, 10000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
        'clf__n_estimators': [50, 100, 200],
        'clf__min_samples_split': [2, 3, 4],
        'features__transformer_weights': (
            {'text_pipeline': 1, 'starting_verb': 0.5},
            {'text_pipeline': 0.5, 'starting_verb': 1},
            {'text_pipeline': 0.8, 'starting_verb': 1},
        )
    }

    cv = GridSearchCV(pipeline, param_grid=parameters)

    return cv
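# NOTE (not in the original gist): this grid has 2*3*3*2 * 3*3 * 3 = 972
# candidate settings, each fitted once per cross-validation fold, so the search
# is slow. Something like GridSearchCV(pipeline, param_grid=parameters,
# n_jobs=-1, verbose=1) would parallelise it across cores, memory permitting.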
def display_results(cv, y_test, y_pred):
    # Use the union of true and predicted labels so no class is silently
    # dropped from the confusion matrix
    labels = np.unique(np.concatenate([y_test, y_pred]))
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
    print("\nBest Parameters:", cv.best_params_)
def main():
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = build_model()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    display_results(model, y_test, y_pred)


if __name__ == '__main__':
    main()
@oneryalcin (Author):
An example result would be:

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 92   1  15]
 [  2  32   3]
 [ 14   2 440]]
Accuracy: 0.9384359401

Best Parameters: {'features__transformer_weights': {'text_pipeline': 1, 'starting_verb': 0.5}}
