Last active
August 27, 2017 05:56
-
-
Save wassname/00b94dc7eb7220c136685fce782be3a4 to your computer and use it in GitHub Desktop.
Try all dummy models in sklearn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# from https://gist.github.com/wassname/00b94dc7eb7220c136685fce782be3a4 | |
from io import StringIO | |
import pandas as pd | |
import numpy as np | |
from sklearn import metrics | |
import sklearn | |
def parse_classification_report(classification_report):
    """Parse the text of a sklearn classification report into a DataFrame.

    The report is tokenised on whitespace instead of fixed column offsets,
    so it works regardless of how wide the label column is (long class
    names, or the wider layout used by newer scikit-learn releases).

    Parameters
    ----------
    classification_report : str
        Text as returned by ``sklearn.metrics.classification_report``.

    Returns
    -------
    pandas.DataFrame
        One row per report line that carries a value for every header
        column, indexed by the row label (class name or average name).
        Columns are taken from the report header (``precision``,
        ``recall``, ``f1-score``, ``support``) and hold floats.  Rows that
        lack a value for every column (e.g. the ``accuracy`` row) are
        dropped.  An empty report yields an empty DataFrame.
    """
    lines = [line for line in classification_report.splitlines() if line.strip()]
    if not lines:
        return pd.DataFrame()

    columns = lines[0].split()
    n_values = len(columns)

    labels = []
    rows = []
    for line in lines[1:]:
        # Split from the right: the last ``n_values`` tokens are the numbers,
        # whatever remains on the left is the label (which may contain spaces,
        # e.g. "avg / total" or "weighted avg").
        parts = line.rsplit(None, n_values)
        if len(parts) != n_values + 1:
            # Not a full row (e.g. "accuracy" only has two numbers).
            continue
        try:
            values = [float(token) for token in parts[1:]]
        except ValueError:
            # Part of the label was mistaken for a number column; skip the row.
            continue
        labels.append(parts[0].strip())
        rows.append(values)

    return pd.DataFrame(rows, index=labels, columns=columns)
# Smoke test: build a report from two random boolean label vectors and make
# sure the parsed result converts to a plain dict.
s = metrics.classification_report(
    np.random.random(100) > 0.5,
    np.random.random(100) > 0.5,
)
d = parse_classification_report(s).to_dict()
assert isinstance(d, dict)
from sklearn.dummy import DummyClassifier, DummyRegressor | |
from sklearn.model_selection import train_test_split | |
import collections | |
def _summary_f1(report):
    """Return the support-weighted average F1 from a parsed classification report."""
    f1_column = report['f1-score']
    # scikit-learn < 0.20 labels the weighted-average row 'avg / total';
    # later releases call the same quantity 'weighted avg'.
    for label in ('avg / total', 'weighted avg'):
        if label in f1_column.index:
            return f1_column[label]
    raise KeyError("no 'avg / total' or 'weighted avg' row in classification report")


def _evaluate_dummy(model, name, X_train, X_test, y_train, y_test, thresh, target_names):
    """Fit one dummy model and summarise its performance on the test split."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    # Both targets and predictions are binarised with `thresh` so classifier
    # and regressor baselines are compared on the same classification report.
    report = parse_classification_report(
        sklearn.metrics.classification_report(
            y_test > thresh, y_pred > thresh, target_names=target_names
        )
    )
    return collections.OrderedDict(
        strategy=name,
        f1score=_summary_f1(report),
        score=score,
        report=report,
    )


def find_best_dummy_classification(X, y, test_size=0.3, random_state=0, thresh=0.5, target_names=None):
    """Try all dummy models and rank them by weighted-average F1.

    Every ``DummyClassifier`` strategy ('most_frequent', 'uniform', 'prior',
    'stratified') and ``DummyRegressor`` strategy ('mean', 'median') is fitted
    on a train split and evaluated on the held-out split.

    Parameters
    ----------
    X : array-like
        Features; flattened to 2-D with one row per sample.
    y : array-like
        Targets; compared against `thresh` to obtain boolean labels for the
        classification report.
    test_size : float
        Passed to ``train_test_split``.
    random_state : int
        Seed for the train/test split and for the randomised dummy
        classifier strategies, so repeated calls give the same ranking.
    thresh : float
        Threshold used to binarise targets and predictions.
    target_names : list of str, optional
        Class display names forwarded to ``classification_report``.

    Returns
    -------
    df : pandas.DataFrame
        One row per dummy model, indexed by strategy name
        ('classifier_*' / 'regressor_*'), sorted by ``f1score`` descending,
        with columns ``f1score``, ``score`` (the model's own ``score()``)
        and ``report`` (the parsed classification report).
    best : dict
        The first (best-f1) row of `df` as a dict.
    """
    # Accept lists / DataFrames as well as ndarrays.
    X = np.asarray(X)
    X = X.reshape((len(X), -1))
    y = np.asarray(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    split = (X_train, X_test, y_train, y_test)

    dummy_scores = []
    for strategy in ['most_frequent', 'uniform', 'prior', 'stratified']:
        # Seed the classifier: 'uniform' and 'stratified' draw random labels.
        clf = DummyClassifier(strategy=strategy, random_state=random_state)
        dummy_scores.append(
            _evaluate_dummy(clf, 'classifier_' + strategy, *split,
                            thresh=thresh, target_names=target_names)
        )
    for strategy in ['mean', 'median']:
        reg = DummyRegressor(strategy=strategy)
        dummy_scores.append(
            _evaluate_dummy(reg, 'regressor_' + strategy, *split,
                            thresh=thresh, target_names=target_names)
        )

    df = pd.DataFrame(dummy_scores)
    df = df.sort_values('f1score', ascending=False)
    df = df.set_index('strategy')
    return df, df.iloc[0].to_dict()
# Demo on random data: 100 samples of shape (10, 10, 10) with random boolean
# targets, so no dummy strategy should do much better than chance.
import numpy as np

X = np.random.random((100, 10, 10, 10))
y = np.random.random(100) > 0.5
df, best_dummy = find_best_dummy_classification(X, y)
print(best_dummy)
df
# NOTE: the example below predates the current code; the dict printed above
# now has the keys 'f1score', 'score' and 'report'.
"""
# example result
{'matthews_corrcoef': 0.1414213562373095,
'report': precision recall f1-score support
False 0.55 0.73 0.63 15.0
True 0.60 0.40 0.48 15.0
avg / total 0.57 0.57 0.55 30.0,
'score': 0.56666666666666665}
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment