Skip to content

Instantly share code, notes, and snippets.

@wassname
Last active August 27, 2017 05:56
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wassname/00b94dc7eb7220c136685fce782be3a4 to your computer and use it in GitHub Desktop.
Try all dummy models in sklearn
# from https://gist.github.com/wassname/00b94dc7eb7220c136685fce782be3a4
from io import StringIO
import pandas as pd
import numpy as np
from sklearn import metrics
import sklearn
def parse_classification_report(classification_report, colspecs=None):
    """Parse the plain-text table emitted by sklearn's classification_report.

    Parameters
    ----------
    classification_report : str
        The fixed-width report string (label, precision, recall,
        f1-score, support columns).
    colspecs : list of (int, int) tuples, optional
        Character boundaries of each column. Defaults to the layout
        produced by sklearn circa 2017 (a 12-char label column followed
        by four 10-char metric columns) — pass your own if labels are
        longer than 12 characters or the report layout differs.

    Returns
    -------
    pandas.DataFrame
        One row per class label (plus the summary row), indexed by the
        label text; blank separator rows are dropped.
    """
    if colspecs is None:
        # Default column layout of the 2017-era report format.
        colspecs = [(0, 12), (12, 22), (22, 32), (32, 42), (42, 52)]
    return pd.read_fwf(
        StringIO(classification_report),
        index_col=0,
        colspecs=colspecs,
    ).dropna()
# Smoke test: a real sklearn report should round-trip through the
# parser and convert cleanly into a plain dict.
report_text = metrics.classification_report(
    np.random.random(100) > 0.5,
    np.random.random(100) > 0.5,
)
parsed = parse_classification_report(report_text).to_dict()
assert isinstance(parsed, dict)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import train_test_split
import collections
def find_best_dummy_classification(X, y, test_size=0.3, random_state=0, thresh=0.5, target_names=None):
    """Fit every sklearn dummy baseline model and rank them by f1-score.

    Tries all DummyClassifier strategies plus the mean/median
    DummyRegressor strategies (regressor predictions are binarized at
    ``thresh``), so you know what "no skill" looks like on your data.

    Parameters
    ----------
    X : array-like
        Feature array; flattened to 2-D (n_samples, -1) before fitting.
    y : array-like
        Targets; compared against ``thresh`` to form binary labels for
        the classification report.
    test_size : float, optional
        Fraction of the data held out for evaluation.
    random_state : int, optional
        Seeds both the train/test split and the stochastic dummy
        strategies ('stratified', 'uniform') for reproducibility.
    thresh : float, optional
        Threshold used to binarize y_test / y_pred for the report.
    target_names : list of str, optional
        Display names forwarded to classification_report.

    Returns
    -------
    (df, best) : (pandas.DataFrame, dict)
        ``df`` holds one row per strategy sorted by f1score descending;
        ``best`` is the top row as a dict.
    """
    X = X.reshape((len(X), -1))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    def _evaluate(model, label):
        # Fit one dummy model and collect its score + parsed report.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        report = parse_classification_report(
            sklearn.metrics.classification_report(
                y_test > thresh, y_pred > thresh, target_names=target_names))
        return collections.OrderedDict(
            strategy=label,
            # NOTE: relies on the 'avg / total' summary row of the
            # 2017-era sklearn report format.
            f1score=report['f1-score']['avg / total'],
            score=model.score(X_test, y_test),
            report=report,
        )

    dummy_scores = []
    for strategy in ['most_frequent', 'uniform', 'prior', 'stratified']:
        # Bugfix: pass random_state so the stochastic strategies
        # ('uniform', 'stratified') are reproducible, matching the
        # seeded train/test split.
        clf = DummyClassifier(strategy=strategy, random_state=random_state)
        dummy_scores.append(_evaluate(clf, 'classifier_' + strategy))
    for strategy in ['mean', 'median']:
        dummy_scores.append(
            _evaluate(DummyRegressor(strategy=strategy), 'regressor_' + strategy))

    df = pd.DataFrame(dummy_scores)
    df = df.sort_values('f1score', ascending=False)
    df.index = df.strategy
    df = df.drop('strategy', axis=1)
    return df, df.iloc[0].to_dict()
# Demo: run the dummy-model search on purely random data, so every
# baseline should hover around chance level.
import numpy as np

features = np.random.random((100, 10, 10, 10))
labels = np.random.random((100)) > 0.5
df, best_dummy = find_best_dummy_classification(features, labels)
print(best_dummy)
df
"""
# example result
{'matthews_corrcoef': 0.1414213562373095,
'report': precision recall f1-score support
False 0.55 0.73 0.63 15.0
True 0.60 0.40 0.48 15.0
avg / total 0.57 0.57 0.55 30.0,
'score': 0.56666666666666665}
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment