Last active
June 12, 2017 14:02
-
-
Save ak110/e46641928a3c1b7ec71b6072ebe58f55 to your computer and use it in GitHub Desktop.
Baggingとその偽物(?)の効果を調べてみた ref: http://qiita.com/ak11/items/55036ac10d08cc4f426f
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
アンサンブルのお試しコード | |
""" | |
import numpy as np
import sklearn.base
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
import sklearn.linear_model
import sklearn.neural_network
import sklearn.datasets
class CVAggregatingClassifier:
    """Ensemble that averages the fold models of one cross-validation run.

    One clone of ``base_estimator`` is fitted per StratifiedKFold split and
    their ``predict_proba`` outputs are averaged at prediction time.  Similar
    in spirit to bagging, except each model sees a deterministic
    ``(n_fold - 1) / n_fold`` share of the training data instead of a
    bootstrap sample.
    """

    def __init__(self, base_estimator, n_fold):
        # Template estimator: clones of it are fitted, never the original.
        self.base_estimator = base_estimator
        self.n_fold = n_fold
        self.estimators_ = []  # fitted per-fold clones
        self.cv_score_ = 0     # sample-weighted mean out-of-fold accuracy

    def fit(self, X, y):
        """Fit one clone per stratified fold.

        Returns ``self`` (scikit-learn estimator convention, so the object
        can be used in pipelines / chained calls).
        """
        self.estimators_ = []
        # Sorted unique labels.  StratifiedKFold puts every class into every
        # training split, so each fold estimator's predict_proba columns
        # follow exactly this order.
        self.classes_ = np.unique(y)
        score_list = []
        count_list = []
        skf = sklearn.model_selection.StratifiedKFold(n_splits=self.n_fold, shuffle=True)
        for train, test in skf.split(X, y):
            estimator = sklearn.base.clone(self.base_estimator)
            estimator.fit(X[train], y[train])
            pred = estimator.predict(X[test])
            score_list.append(sklearn.metrics.accuracy_score(y[test], pred))
            count_list.append(len(test))
            self.estimators_.append(estimator)
        # Weight by fold size so unequal folds do not skew the estimate.
        self.cv_score_ = np.average(score_list, weights=count_list)
        return self

    def predict(self, X):
        """Return the class *label* with the highest averaged probability.

        Mapping through ``classes_`` fixes the original behavior of returning
        the raw argmax column index, which is only correct when the labels
        happen to be 0..n_classes-1.
        """
        return self.classes_[np.argmax(self.predict_proba(X), axis=-1)]

    def predict_proba(self, X):
        """Average the per-fold models' probability estimates."""
        return np.mean([e.predict_proba(X) for e in self.estimators_], axis=0)
if __name__ == '__main__':
    np.random.seed(3456)
    # fetch_mldata() is dead: mldata.org is permanently offline and the
    # function was removed in scikit-learn 0.22.  fetch_openml serves the
    # same 70k-sample MNIST; its targets arrive as strings, so convert.
    mnist = sklearn.datasets.fetch_openml('mnist_784', version=1, as_frame=False)
    X, y = mnist.data, mnist.target.astype(np.float64)
    # Deliberately small train set (10k) / large test set (60k) so the
    # ensemble-vs-single-model differences are not washed out by data volume.
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=60000.0 / len(y))
    print('train size={} test size={}'.format(len(X_train), len(X_test)))
    # Factories (not instances) so every ensemble member starts from an
    # independent, unfitted model.
    factories = [
        ('lr', lambda: sklearn.linear_model.LogisticRegression(n_jobs=-1)),
        ('rf', lambda: sklearn.ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1)),
        ('nn', lambda: sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(1000, 1000))),
    ]
    for base_name, factory in factories:
        # Compare a single model against CV-aggregation and bagging of
        # matching ensemble sizes.
        estimators = [
            ('base', factory()),
            ('cv5', CVAggregatingClassifier(factory(), n_fold=5)),
            ('cv10', CVAggregatingClassifier(factory(), n_fold=10)),
            ('cv15', CVAggregatingClassifier(factory(), n_fold=15)),
            ('bag5', sklearn.ensemble.BaggingClassifier(factory(), n_estimators=5, oob_score=True)),
            ('bag10', sklearn.ensemble.BaggingClassifier(factory(), n_estimators=10, oob_score=True)),
            ('bag15', sklearn.ensemble.BaggingClassifier(factory(), n_estimators=15, oob_score=True)),
        ]
        for name, estimator in estimators:
            estimator.fit(X_train, y_train)
            y_pred = estimator.predict(X_test)
            acc = sklearn.metrics.accuracy_score(y_test, y_pred)
            # Report whichever internal score the estimator exposes alongside
            # the held-out accuracy (all percentages).
            if hasattr(estimator, 'oob_score_'):
                print('{}-{:5s}: val_acc={:.2f} oob_score={:.2f}'.format(base_name, name, acc * 100, estimator.oob_score_ * 100))
            elif hasattr(estimator, 'cv_score_'):
                print('{}-{:5s}: val_acc={:.2f} cv_score={:.2f}'.format(base_name, name, acc * 100, estimator.cv_score_ * 100))
            else:
                print('{}-{:5s}: val_acc={:.2f}'.format(base_name, name, acc * 100))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment