Skip to content

Instantly share code, notes, and snippets.

@ak110
Last active June 12, 2017 14:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save ak110/e46641928a3c1b7ec71b6072ebe58f55 to your computer and use it in GitHub Desktop.
Baggingとその偽物(?)の効果を調べてみた ref: http://qiita.com/ak11/items/55036ac10d08cc4f426f
"""
アンサンブルのお試しコード
"""
import numpy as np
import sklearn.base
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
import sklearn.linear_model
import sklearn.neural_network
import sklearn.datasets
class CVAggregatingClassifier:
    """Ensemble classifier built from the models trained during cross-validation.

    Unlike bagging, every sample is used for training exactly ``n_fold - 1``
    times: the data is split with stratified K-fold, one clone of
    ``base_estimator`` is fitted per training fold, and prediction averages
    the per-fold ``predict_proba`` outputs.

    Parameters
    ----------
    base_estimator : estimator
        Classifier implementing ``fit`` and ``predict_proba``; cloned once
        per fold via ``sklearn.base.clone``.
    n_fold : int
        Number of stratified folds (and therefore of fitted estimators).

    Attributes
    ----------
    estimators_ : list
        The fitted per-fold estimators (populated by ``fit``).
    cv_score_ : float
        Accuracy on the held-out folds, weighted by fold size.
    """

    def __init__(self, base_estimator, n_fold):
        self.base_estimator = base_estimator
        self.n_fold = n_fold
        self.estimators_ = []
        self.cv_score_ = 0

    def fit(self, X, y):
        """Fit one clone of ``base_estimator`` per stratified fold.

        Also computes ``cv_score_`` as the size-weighted mean accuracy over
        the held-out parts of each fold.
        """
        self.estimators_ = []
        score_list = []
        count_list = []
        skf = sklearn.model_selection.StratifiedKFold(n_splits=self.n_fold, shuffle=True)
        for train, test in skf.split(X, y):
            estimator = sklearn.base.clone(self.base_estimator)
            estimator.fit(X[train], y[train])
            pred = estimator.predict(X[test])
            score = sklearn.metrics.accuracy_score(y[test], pred)
            score_list.append(score)
            count_list.append(len(y[test]))
            self.estimators_.append(estimator)
        self.cv_score_ = np.average(score_list, weights=count_list)
        # scikit-learn convention: fit returns self so calls can be chained.
        return self

    def predict(self, X):
        """Predict class labels by majority of the averaged probabilities."""
        # Map argmax column indices back to the actual class labels; the raw
        # index is only correct when labels happen to be 0..k-1.
        # NOTE: assumes every fold saw all classes (true for stratified
        # splits unless a class has fewer samples than n_fold).
        classes = self.estimators_[0].classes_
        return classes[np.argmax(self.predict_proba(X), axis=-1)]

    def predict_proba(self, X):
        """Return the unweighted mean of the per-fold probability estimates."""
        return np.mean([e.predict_proba(X) for e in self.estimators_], axis=0)
if __name__ == '__main__':
    np.random.seed(3456)

    # Fetch MNIST and split so that the *test* side holds 60000 samples
    # (i.e. a deliberately small 10000-sample training set).
    mnist = sklearn.datasets.fetch_mldata('MNIST original')
    X, y = mnist.data, mnist.target
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=60000.0 / len(y))
    print('train size={} test size={}'.format(len(X_train), len(X_test)))

    # One factory per model family so each ensemble variant gets a fresh,
    # identically-configured base estimator.
    factories = [
        ('lr', lambda: sklearn.linear_model.LogisticRegression(n_jobs=-1)),
        ('rf', lambda: sklearn.ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1)),
        ('nn', lambda: sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(1000, 1000))),
    ]
    for family, make_model in factories:
        # Plain model vs. CV-aggregation vs. bagging at three ensemble sizes.
        candidates = [
            ('base', make_model()),
            ('cv5', CVAggregatingClassifier(make_model(), n_fold=5)),
            ('cv10', CVAggregatingClassifier(make_model(), n_fold=10)),
            ('cv15', CVAggregatingClassifier(make_model(), n_fold=15)),
            ('bag5', sklearn.ensemble.BaggingClassifier(make_model(), n_estimators=5, oob_score=True)),
            ('bag10', sklearn.ensemble.BaggingClassifier(make_model(), n_estimators=10, oob_score=True)),
            ('bag15', sklearn.ensemble.BaggingClassifier(make_model(), n_estimators=15, oob_score=True)),
        ]
        for label, model in candidates:
            model.fit(X_train, y_train)
            accuracy = sklearn.metrics.accuracy_score(y_test, model.predict(X_test))
            # Report the internal estimate (OOB or CV) alongside validation
            # accuracy when the estimator exposes one.
            if hasattr(model, 'oob_score_'):
                print('{}-{:5s}: val_acc={:.2f} oob_score={:.2f}'.format(family, label, accuracy * 100, model.oob_score_ * 100))
            elif hasattr(model, 'cv_score_'):
                print('{}-{:5s}: val_acc={:.2f} cv_score={:.2f}'.format(family, label, accuracy * 100, model.cv_score_ * 100))
            else:
                print('{}-{:5s}: val_acc={:.2f}'.format(family, label, accuracy * 100))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment