Skip to content

Instantly share code, notes, and snippets.

@mneedham
Last active December 1, 2020 21:56
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save mneedham/3936d657b50b7c07cd3fe0c8d8c71496 to your computer and use it in GitHub Desktop.
Save mneedham/3936d657b50b7c07cd3fe0c8d8c71496 to your computer and use it in GitHub Desktop.
Spooky Author Identification - Grid search for VotingClassifier
import pandas as pd
from sklearn import linear_model
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Names of the label and text columns in the training data.
Y_COLUMN = "author"
TEXT_COLUMN = "text"

# Token counts (CountVectorizer with its default settings) fed into a
# logistic regression.
unigram_log_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('logreg', linear_model.LogisticRegression()),
])

# Unigram + bigram counts fed into multinomial naive Bayes.
ngram_pipe = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1, 2))),
    ('mnb', MultinomialNB()),
])

# TF-IDF weighted 1- to 3-grams fed into multinomial naive Bayes.
# use_idf / smooth_idf / sublinear_tf are boolean options: pass real
# booleans rather than the integer 1, which recent scikit-learn releases
# reject during parameter validation.
tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
        stop_words='english',
    )),
    ('mnb', MultinomialNB()),
])

# (name, estimator) pairs handed to the VotingClassifier; the order here is
# the order in which per-classifier weights are applied.
classifiers = [
    ("ngram", ngram_pipe),
    ("unigram", unigram_log_pipe),
    ("tfidf", tfidf_pipe),
]

# Wrapping the ensemble in a single-step pipeline named "voting" is what
# makes its weights addressable as "voting__weights" in a parameter grid.
mixed_pipe = Pipeline([
    ("voting", VotingClassifier(classifiers, voting="soft")),
])
def combinations_on_off(num_classifiers):
    """Return every non-empty on/off pattern for ``num_classifiers`` slots.

    Each pattern is a list of ``num_classifiers`` ints, each 0 (off) or 1
    (on). Patterns are the binary representations of 1 .. 2**n - 1, most
    significant bit first, in ascending numeric order. The all-zero pattern
    is deliberately left out (presumably because a weighting that switches
    every classifier off is meaningless -- confirm with the caller).

    Example: ``combinations_on_off(2) == [[0, 1], [1, 0], [1, 1]]``.
    ``combinations_on_off(0)`` returns an empty list.
    """
    # Bit positions from most to least significant, so the output reads
    # like a zero-padded binary number.
    bit_positions = range(num_classifiers - 1, -1, -1)
    return [[(mask >> pos) & 1 for pos in bit_positions]
            for mask in range(1, 2 ** num_classifiers)]
# Search space: one candidate per on/off weighting of the voting members.
param_grid = {
    "voting__weights": combinations_on_off(len(classifiers)),
}

# Read only the label and text columns from the training file.
train_df = pd.read_csv("train.csv", usecols=[Y_COLUMN, TEXT_COLUMN])
X = pd.Series(train_df[TEXT_COLUMN])
y = train_df[Y_COLUMN].copy()

# Cross-validated search over the weightings, scored by negative log loss
# and run on all available cores.
grid_search = GridSearchCV(
    mixed_pipe,
    param_grid=param_grid,
    scoring="neg_log_loss",
    n_jobs=-1,
    verbose=10,
)
grid_search.fit(X, y)

# Print the mean test score obtained for every weighting that was tried.
cv_results = grid_search.cv_results_
for params, mean_score in zip(cv_results["params"],
                              cv_results["mean_test_score"]):
    print(params, mean_score)

# Summarise the winning configuration.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment