Skip to content

Instantly share code, notes, and snippets.

@wassname
Last active August 27, 2017 05:56
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wassname/00b94dc7eb7220c136685fce782be3a4 to your computer and use it in GitHub Desktop.
Try all dummy models in sklearn
# from https://gist.github.com/wassname/00b94dc7eb7220c136685fce782be3a4
from io import StringIO
import pandas as pd
import numpy as np
from sklearn import metrics
import sklearn
def parse_classification_report(classification_report, colspecs=None):
    """Parse the plain-text table emitted by sklearn's classification_report.

    Parameters
    ----------
    classification_report : str
        The fixed-width report string (label, precision, recall,
        f1-score, support columns).
    colspecs : list of (int, int) tuples, optional
        Character boundaries of each column. Defaults to the layout
        produced by sklearn circa 2017 (a 12-char label column followed
        by four 10-char metric columns) — pass your own if labels are
        longer than 12 characters or the report layout differs.

    Returns
    -------
    pandas.DataFrame
        One row per class label (plus the summary row), indexed by the
        label text; blank separator rows are dropped.
    """
    if colspecs is None:
        # Default column layout of the 2017-era report format.
        colspecs = [(0, 12), (12, 22), (22, 32), (32, 42), (42, 52)]
    return pd.read_fwf(
        StringIO(classification_report),
        index_col=0,
        colspecs=colspecs,
    ).dropna()
# Smoke test: a real sklearn report should round-trip through the
# parser and convert cleanly into a plain dict.
report_text = metrics.classification_report(
    np.random.random(100) > 0.5,
    np.random.random(100) > 0.5,
)
parsed = parse_classification_report(report_text).to_dict()
assert isinstance(parsed, dict)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import train_test_split
import collections
def find_best_dummy_classification(X, y, test_size=0.3, random_state=0, thresh=0.5, target_names=None):
    """Fit every sklearn dummy baseline model and rank them by f1-score.

    Tries all DummyClassifier strategies plus the mean/median
    DummyRegressor strategies (regressor predictions are binarized at
    ``thresh``), so you know what "no skill" looks like on your data.

    Parameters
    ----------
    X : array-like
        Feature array; flattened to 2-D (n_samples, -1) before fitting.
    y : array-like
        Targets; compared against ``thresh`` to form binary labels for
        the classification report.
    test_size : float, optional
        Fraction of the data held out for evaluation.
    random_state : int, optional
        Seeds both the train/test split and the stochastic dummy
        strategies ('stratified', 'uniform') for reproducibility.
    thresh : float, optional
        Threshold used to binarize y_test / y_pred for the report.
    target_names : list of str, optional
        Display names forwarded to classification_report.

    Returns
    -------
    (df, best) : (pandas.DataFrame, dict)
        ``df`` holds one row per strategy sorted by f1score descending;
        ``best`` is the top row as a dict.
    """
    X = X.reshape((len(X), -1))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    def _evaluate(model, label):
        # Fit one dummy model and collect its score + parsed report.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        report = parse_classification_report(
            sklearn.metrics.classification_report(
                y_test > thresh, y_pred > thresh, target_names=target_names))
        return collections.OrderedDict(
            strategy=label,
            # NOTE: relies on the 'avg / total' summary row of the
            # 2017-era sklearn report format.
            f1score=report['f1-score']['avg / total'],
            score=model.score(X_test, y_test),
            report=report,
        )

    dummy_scores = []
    for strategy in ['most_frequent', 'uniform', 'prior', 'stratified']:
        # Bugfix: pass random_state so the stochastic strategies
        # ('uniform', 'stratified') are reproducible, matching the
        # seeded train/test split.
        clf = DummyClassifier(strategy=strategy, random_state=random_state)
        dummy_scores.append(_evaluate(clf, 'classifier_' + strategy))
    for strategy in ['mean', 'median']:
        dummy_scores.append(
            _evaluate(DummyRegressor(strategy=strategy), 'regressor_' + strategy))

    df = pd.DataFrame(dummy_scores)
    df = df.sort_values('f1score', ascending=False)
    df.index = df.strategy
    df = df.drop('strategy', axis=1)
    return df, df.iloc[0].to_dict()
# Demo: run the dummy-model search on purely random data, so every
# baseline should hover around chance level.
import numpy as np

features = np.random.random((100, 10, 10, 10))
labels = np.random.random((100)) > 0.5
df, best_dummy = find_best_dummy_classification(features, labels)
print(best_dummy)
df
"""
# example result
{'matthews_corrcoef': 0.1414213562373095,
'report': precision recall f1-score support
False 0.55 0.73 0.63 15.0
True 0.60 0.40 0.48 15.0
avg / total 0.57 0.57 0.55 30.0,
'score': 0.56666666666666665}
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment