Skip to content

Instantly share code, notes, and snippets.

@raven4752
Created November 7, 2018 04:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save raven4752/56eab728df8ae53619a4307f38ebecd9 to your computer and use it in GitHub Desktop.
Save raven4752/56eab728df8ae53619a4307f38ebecd9 to your computer and use it in GitHub Desktop.
Machine-learning routine code: cross-validated ROC-AUC benchmarking of RandomForest and AdaBoost classifiers on a pre-split dataset, with a plot of score versus number of estimators.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
class ModelTester:
    """Load a train/test split from disk and evaluate model callbacks on it.

    A "model callback" has the signature (xtr, ytr, xte, yte, **kwargs) and
    returns one prediction score per row of ``xte`` (see ``rf_func`` /
    ``adaboost_func`` in this file).
    """

    def __init__(self, ds_dir='adult_dataset'):
        """Load xtr/ytr/xte/yte from space-delimited text files in ``ds_dir``.

        Expects the four files xtr.txt, ytr.txt, xte.txt and yte.txt.
        """
        xtr_path = os.path.join(ds_dir, 'xtr.txt')
        ytr_path = os.path.join(ds_dir, 'ytr.txt')
        xte_path = os.path.join(ds_dir, 'xte.txt')
        yte_path = os.path.join(ds_dir, 'yte.txt')
        self.xtr = np.genfromtxt(xtr_path, delimiter=' ')
        self.ytr = np.genfromtxt(ytr_path, delimiter=' ')
        self.xte = np.genfromtxt(xte_path, delimiter=' ')
        self.yte = np.genfromtxt(yte_path, delimiter=' ')

    def cross_valid_model_predictions(self, model_train_predict_func, num_folds=10, seed=1, **kwargs):
        """Run k-fold cross-validation of a model callback on the training set.

        Parameters
        ----------
        model_train_predict_func : callable(xtr, ytr, xte, yte, **kwargs)
            Trains on the fold's training part and returns predictions for
            the fold's validation part.
        num_folds : number of CV folds.
        seed : random_state for the (shuffled) KFold split.
        **kwargs : forwarded to ``model_train_predict_func``.

        Returns
        -------
        (predictions, labels) : out-of-fold predictions and the matching
        ground-truth labels, each concatenated over all folds in fold order.
        """
        kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
        predictions_all_fold = []
        label_all_fold = []
        for tr_index, te_index in kfold.split(self.xtr):
            xgr = self.xtr[tr_index]
            ygr = self.ytr[tr_index]
            xva = self.xtr[te_index]
            yva = self.ytr[te_index]
            predictions = model_train_predict_func(xgr, ygr, xva, yva, **kwargs)
            predictions_all_fold.append(predictions)
            label_all_fold.append(yva)
        return np.concatenate(predictions_all_fold), np.concatenate(label_all_fold)

    def cross_valid_model(self, model_train_predict_func, num_folds=10, seed=1, score='auc', **kwargs):
        """Cross-validate a model callback and return a single metric value.

        Only ``score='auc'`` (ROC-AUC) is supported.

        Raises
        ------
        ValueError
            For an unknown ``score`` name (previously the method silently
            returned None, which hid typos in the metric name).
        """
        p, l = self.cross_valid_model_predictions(model_train_predict_func=model_train_predict_func,
                                                  num_folds=num_folds, seed=seed, **kwargs)
        if score == 'auc':
            return roc_auc_score(l, p)
        raise ValueError('unsupported score: %r' % (score,))

    def predict_test(self, model_train_predict_func):
        """Train a model callback on the full training set and return its test-set predictions."""
        return model_train_predict_func(self.xtr, self.ytr, self.xte, self.yte)
def rf_func(xtr, ytr, xte, yte, **kwargs):
    """Fit a random forest on (xtr, ytr) and return positive-class probabilities for xte.

    ``yte`` is accepted to match the model-callback signature but is not used.
    Extra keyword arguments are forwarded to ``RandomForestClassifier``.
    """
    model = RandomForestClassifier(**kwargs)
    model.fit(xtr, ytr)
    proba = model.predict_proba(xte)
    return proba[:, 1]
def adaboost_func(xtr, ytr, xte, yte, **kwargs):
    """Fit an AdaBoost ensemble on (xtr, ytr) and return positive-class probabilities for xte.

    ``yte`` is accepted to match the model-callback signature but is not used.
    Extra keyword arguments are forwarded to ``AdaBoostClassifier``.
    """
    model = AdaBoostClassifier(**kwargs)
    model.fit(xtr, ytr)
    proba = model.predict_proba(xte)
    return proba[:, 1]
def get_benchmark_score(n_estimators=50, mt=None):
    """Cross-validate RandomForest and AdaBoost and return (rf_auc, ada_auc).

    Parameters
    ----------
    n_estimators : number of estimators for both ensembles.
    mt : optional pre-built ModelTester. When None (the default, and the
        original behavior) a new one is constructed, which re-reads the whole
        dataset from disk — pass one in when calling this in a loop.

    Returns
    -------
    (score_rf, score_ada) : ROC-AUC of each model, also printed.
    """
    if mt is None:
        mt = ModelTester()
    score_rf = mt.cross_valid_model(rf_func, n_estimators=n_estimators)
    score_ada = mt.cross_valid_model(adaboost_func, n_estimators=n_estimators)
    print(score_rf)   # ~0.9002 observed with default settings — dataset-dependent
    print(score_ada)  # ~0.9093 observed with default settings — dataset-dependent
    return score_rf, score_ada
def plot_benchmark_scores(start=10, end=110, step=10):
    """Plot cross-validated ROC-AUC of RandomForest and AdaBoost against the estimator count.

    Runs ``get_benchmark_score`` for each n_estimators in range(start, end, step)
    and shows both score curves on one figure.
    """
    estimator_counts = np.arange(start=start, stop=end, step=step)
    scores = [get_benchmark_score(n_estimators=int(n)) for n in estimator_counts]
    rf_scores = [pair[0] for pair in scores]
    ada_scores = [pair[1] for pair in scores]
    plt.figure()
    plt.title('roc-auc score of random forest and adaboost')
    plt.xlabel('num estimator')
    plt.ylabel('roc-auc score')
    plt.xticks(estimator_counts)
    plt.plot(estimator_counts, rf_scores, label='RandomForest')
    plt.plot(estimator_counts, ada_scores, label='AdaBoost')
    plt.legend()
    plt.show()
# Script entry point: sweep n_estimators over the default range and plot the benchmark curves.
if __name__ == '__main__':
    plot_benchmark_scores()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment