Skip to content

Instantly share code, notes, and snippets.

@yuriybash
Created January 24, 2019 21:36
Show Gist options
  • Save yuriybash/8ba550c0481af242d5815b9ee13e3b73 to your computer and use it in GitHub Desktop.
Save yuriybash/8ba550c0481af242d5815b9ee13e3b73 to your computer and use it in GitHub Desktop.
gridsearch
import sys
import yaml
from os.path import dirname, join
import pandas as pd
import numpy as np
import random as rnd
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
# Registry mapping the short vectorizer names accepted by
# get_vectorizer_cls() to the vectorizer classes they select.
VECTORIZERS = dict(
    count=CountVectorizer,
    tfidf=TfidfVectorizer,
)
def get_vectorizer_cls(name):
    """Look up a vectorizer class by its registered short name.

    Args:
        name: Key into ``VECTORIZERS`` (e.g. ``'count'`` or ``'tfidf'``).

    Returns:
        The vectorizer class (not an instance) registered under ``name``.

    Raises:
        VectorizerLoadException: If ``name`` is not a key of ``VECTORIZERS``.
    """
    try:
        return VECTORIZERS[name]
    except KeyError:
        # The call form of print parses on both Python 2 and Python 3 (the
        # statement form is a SyntaxError on 3). Wrapping ``name`` in a
        # 1-tuple keeps the %-formatting valid even when ``name`` is a tuple.
        print("vectorizer name: %s" % (name,))
        raise VectorizerLoadException(
            "Invalid vectorizer name: %s" % (name,)
        )
# Registry mapping the estimator names accepted by get_estimator_cls()
# to the estimator classes they select.
ESTIMATORS = dict(
    DecisionTreeClassifier=DecisionTreeClassifier,
    LinearSVC=LinearSVC,
    MultinomialNB=MultinomialNB,
    SVC=SVC,
    RandomForestClassifier=RandomForestClassifier,
    GaussianNB=GaussianNB,
    Perceptron=Perceptron,
    SGDClassifier=SGDClassifier,
    KNeighborsClassifier=KNeighborsClassifier,
    LogisticRegression=LogisticRegression,
)
def get_estimator_cls(name):
    """Return the estimator class registered under ``name``.

    Args:
        name: Key into ``ESTIMATORS``.

    Returns:
        The estimator class (not an instance) stored for ``name``.

    Raises:
        EstimatorLoadException: If ``name`` is not a key of ``ESTIMATORS``.
    """
    try:
        estimator_cls = ESTIMATORS[name]
    except KeyError:
        raise EstimatorLoadException("Invalid estimator name: %s" % name)
    return estimator_cls
def parse_data(data_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        data_path: Path of the CSV file, joined onto the parent of the
            directory that contains this module.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    base_dir = dirname(dirname(__file__))
    csv_path = join(base_dir, data_path)
    with open(csv_path) as csv_file:
        return pd.read_csv(csv_file)
def train_test_models():
    """Load ``config.yml``, read the configured data set and train the models.

    The configuration file is looked up in the parent of the directory that
    contains this module. Its ``data`` entry is handed to ``parse_data`` and
    its ``models`` entry, together with the whole configuration, to
    ``train_models``.
    """
    config_path = join(dirname(dirname(__file__)), 'config.yml')
    with open(config_path) as config_file:
        config = yaml.safe_load(config_file)

    frame = parse_data(config['data'])
    train_models(config['models'], frame, config)
def train_models(models, data, config):
    """Train and evaluate every configured model.

    The previous implementation ended the loop with an unconditional
    ``break``, so only the first entry of ``models`` was ever trained; all
    entries are now processed.

    Args:
        models: Mapping of model id to model configuration; each value is
            passed to ``train_model`` as ``model_cfg``.
        data: Data set forwarded unchanged to ``train_model``.
        config: Full configuration mapping; its ``'cross_validation'`` and
            ``'test'`` entries are forwarded to ``train_model``.
    """
    # Only the per-model configuration is needed, so iterate the values
    # rather than unpacking an unused id.
    for model_cfg in models.values():
        train_model(model_cfg, data, config['cross_validation'], config['test'])
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that extracts a single keyed entry from its input.

    Used as the first step of each branch of the feature union so that the
    branch's vectorizer only sees the entry stored under ``key``.
    """

    def __init__(self, key):
        # Key used to index the object passed to transform().
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, data_dict):
        """Return the entry of ``data_dict`` stored under ``self.key``."""
        selected = data_dict[self.key]
        return selected
def _build_pipeline(title_vectorizer, url_vectorizer, estimator):
    """Assemble the title/url feature union followed by the estimator step."""
    return Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('title', Pipeline([
                    ('selector', ItemSelector(key='title')),
                    ('vec', get_vectorizer_cls(title_vectorizer)()),
                ])),
                ('url', Pipeline([
                    ('selector', ItemSelector(key='url')),
                    ('vec', get_vectorizer_cls(url_vectorizer)()),
                ])),
            ],
        )),
        ('estimator_cls', get_estimator_cls(estimator)()),
    ])


def _prefix_params(path, params):
    """Re-key ``params`` with the ``__``-joined pipeline step ``path``."""
    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` is 2-only.
    return dict(
        ('__'.join(path + [key]), value) for key, value in params.items()
    )


def _build_param_grid(title_groups, url_groups, estimator_groups):
    """Combine every title, url and estimator parameter group into one dict."""
    grid = []
    for title_group in title_groups:
        for url_group in url_groups:
            for estimator_group in estimator_groups:
                combined = {}
                combined.update(
                    _prefix_params(['union', 'title', 'vec'], title_group))
                combined.update(
                    _prefix_params(['union', 'url', 'vec'], url_group))
                combined.update(
                    _prefix_params(['estimator_cls'], estimator_group))
                grid.append(combined)
    return grid


def train_model(model_cfg, data_df, cv_cfg, test_cfg):
    """Grid-search one model configuration and print its evaluation.

    A pipeline is built that vectorizes the ``title`` and ``url`` columns
    separately, unions the features and feeds them to the configured
    estimator. For every entry of ``test_cfg['scores']`` a 2-fold
    ``GridSearchCV`` is fitted on the training split, and the best
    parameters, per-candidate scores and a classification report on the
    held-out split are printed.

    Args:
        model_cfg: Mapping with a ``'vectorizer'`` entry (holding ``'title'``
            and ``'url'`` sub-mappings) and an ``'estimator'`` entry. Each of
            those three mappings provides a ``'name'`` and a ``'parameters'``
            iterable of parameter dicts.
        data_df: Data indexed by the columns ``'title'``, ``'url'`` and
            ``'noneng'`` (the target).
        cv_cfg: Mapping whose ``'train_test_split'`` entry ends with the test
            share in percent (presumably a ``[train, test]`` pair - confirm
            against config.yml).
        test_cfg: Mapping whose ``'scores'`` entry lists score names; each is
            suffixed with ``'_macro'`` to form the scoring string.

    Returns:
        None. All results are written to stdout.

    Raises:
        VectorizerLoadException: If a configured vectorizer name is unknown.
        EstimatorLoadException: If the configured estimator name is unknown.
    """
    vectorizer_cfg = model_cfg['vectorizer']
    estimator_cfg = model_cfg['estimator']
    title_cfg = vectorizer_cfg['title']
    url_cfg = vectorizer_cfg['url']

    pipeline = _build_pipeline(
        title_cfg['name'], url_cfg['name'], estimator_cfg['name'])
    parameters = _build_param_grid(
        title_cfg['parameters'],
        url_cfg['parameters'],
        estimator_cfg['parameters'],
    )

    # The last element of the configured split is the test percentage.
    X_train, X_test, Y_train, Y_test = train_test_split(
        data_df[['title', 'url']],
        data_df['noneng'],
        test_size=float(cv_cfg['train_test_split'][-1])/100, random_state=42
    )

    for score in test_cfg['scores']:
        print("# Tuning hyper-parameters for %s\n" % score)
        clf = GridSearchCV(pipeline, parameters, cv=2, scoring='%s_macro' % score)
        clf.fit(X_train, Y_train)
        print("Best parameters set found on development set:\n")
        print(clf.best_params_)
        print("Grid scores on development set:\n")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        # print("") emits a blank line on Python 2 and 3 alike, whereas a
        # bare print() would output "()" under the Python 2 print statement.
        print("")
        print("Detailed classification report:")
        print("")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print("")
        y_true, y_pred = Y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
class VectorizerLoadException(Exception):
    """Signals that a vectorizer could not be initialized."""
class EstimatorLoadException(Exception):
    """Signals that an estimator could not be initialized."""
# Run the train/evaluate flow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train_test_models()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment