Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save AllieUbisse/e38b8364d66811270dc6af5a002d49a2 to your computer and use it in GitHub Desktop.
Save AllieUbisse/e38b8364d66811270dc6af5a002d49a2 to your computer and use it in GitHub Desktop.
Stacking/Blendingをheamyで、Stacknetをpystacknetで高速に実装する
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.datasets import load_boston\n",
"from sklearn.model_selection import KFold\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_absolute_error\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import mean_absolute_error\n",
"from catboost import CatBoostRegressor\n",
"from sklearn.ensemble import ExtraTreesRegressor\n",
"from sklearn.linear_model import Ridge\n",
"from xgboost import XGBRegressor\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"seed = 60"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataload"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X, y = load_boston(return_X_y=True)\n",
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=seed)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Baseline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"models_dic = {\n",
" 'random forest': RandomForestRegressor(n_estimators=50, random_state=seed),\n",
" 'linear regression': LinearRegression(normalize=True),\n",
" 'knn': KNeighborsRegressor(),\n",
" 'catboost': CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')\n",
"}\n",
"\n",
"for name, model in models_dic.items():\n",
" kfold = KFold(n_splits=10, random_state=seed)\n",
" cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=\"neg_mean_absolute_error\")\n",
" print(f'{name} : {-np.mean(cv_results):.2f}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# heamy"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from heamy.dataset import Dataset\n",
"from heamy.estimator import Regressor, Classifier\n",
"from heamy.pipeline import ModelsPipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### stacking"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"# datasetを準備\n",
"dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる\n",
"\n",
"# アンサンブルに使うモデルを定義\n",
"models = [\n",
" Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),\n",
" Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),\n",
" Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),\n",
" Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')\n",
"]\n",
"\n",
"# pipelineを定義、2nd levelデータセットの作成\n",
"pipeline = ModelsPipeline(*models)\n",
"stack_ds = pipeline.stack(k=10, seed=seed)\n",
"\n",
"# modelを作ってvalidation\n",
"stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)\n",
"y_trues, y_preds = stacker.validate(k=10)\n",
"\n",
"# 精度出力\n",
"cv_results = []\n",
"for y_true, y_pred in zip(y_trues, y_preds):\n",
" cv_result = mean_absolute_error(y_true, y_pred)\n",
" cv_results.append(cv_result)\n",
"print(f'stacking: {np.mean(cv_results):.2f}')\n",
"\n",
"# X_testを使ってpredict\n",
"y_pred = stacker.predict()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Blending"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"# datasetを準備\n",
"dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる\n",
"\n",
"# アンサンブルに使うモデルを定義\n",
"models = [\n",
" Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),\n",
" Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),\n",
" Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),\n",
" Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')\n",
"]\n",
"\n",
"# pipelineを定義、2nd levelデータセットの作成\n",
"pipeline = ModelsPipeline(*models)\n",
"stack_ds = pipeline.blend(proportion=0.2, seed=seed)\n",
"\n",
"# modelを作ってvalidation\n",
"stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)\n",
"y_trues, y_preds = stacker.validate(k=10)\n",
"\n",
"# 精度出力\n",
"cv_results = []\n",
"for y_true, y_pred in zip(y_trues, y_preds):\n",
" cv_result = mean_absolute_error(y_true, y_pred)\n",
" cv_results.append(cv_result)\n",
"print(f'blending: {np.mean(cv_results):.2f}')\n",
"\n",
"# X_testを使ってpredict\n",
"y_pred = stacker.predict()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Weighted Average"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"# datasetを準備\n",
"dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる\n",
"\n",
"# アンサンブルに使うモデルを定義\n",
"models = [\n",
" Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),\n",
" Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),\n",
" Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),\n",
" Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')\n",
"]\n",
"\n",
"# pipelineを定義\n",
"pipeline = ModelsPipeline(*models)\n",
"\n",
"# 最適な重みを探索\n",
"weights = pipeline.find_weights(mean_absolute_error)\n",
"pipeline_apply = pipeline.weight(weights)\n",
"\n",
"# 精度を出力\n",
"cv_results = pipeline_apply.validate(scorer=mean_absolute_error, k=10)\n",
"print(f'weighted average: {np.mean(cv_results):.2f}')\n",
"\n",
"# X_testを使ってpredict\n",
"y_pred = pipeline_apply.execute()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# pystacknet"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"from pystacknet.pystacknet import StackNetRegressor\n",
"\n",
"# アンサンブルに使うモデルを定義\n",
"models=[ \n",
" # 1st level\n",
" [\n",
" RandomForestRegressor(n_estimators=50, random_state=seed),\n",
" LinearRegression(normalize=True),\n",
" CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent') \n",
" ],\n",
" # 2nd level\n",
" [\n",
" Ridge(normalize=True),\n",
" ExtraTreesRegressor(random_state=seed),\n",
" XGBRegressor(random_state=seed)\n",
" ],\n",
" [\n",
" LinearRegression(normalize=True)\n",
" ]\n",
"]\n",
"\n",
"# StackNetモデルを作成\n",
"model = StackNetRegressor(\n",
" models, folds=10,\n",
" restacking=False, use_retraining=True,\n",
" random_state=seed, n_jobs=-1\n",
")\n",
"\n",
"# 精度出力\n",
"kfold = KFold(n_splits=10, random_state=seed)\n",
"cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=\"neg_mean_absolute_error\")\n",
"print(name, ': ', -np.mean(cv_results))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"matplotlib.rcParams[\"figure.figsize\"] = (16, 4)\n",
"plt.rcParams[\"font.family\"] = \"IPAexGothic\"\n",
"import logging\n",
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x = [49.5, 54.1, 5.37, 64, 691]\n",
"y = [2.26, 2.14, 3.04, 2.16, 2.20]\n",
"labels = ['単純比較', 'Stacking', 'Blending', 'Weighted Average', 'Stacknet']\n",
"\n",
"fig, ax = plt.subplots(figsize=(16, 8))\n",
"ax.scatter(x, y, s=600, c=\"pink\", alpha=0.5)\n",
"\n",
"for i, txt in enumerate(labels):\n",
" if i == 1:\n",
" ax.annotate(txt, (x[i], y[i]-.04), fontsize=20)\n",
" else:\n",
" ax.annotate(txt, (x[i], y[i]+.01), fontsize=15)\n",
"ax.set_ylabel('Absolute mean error', fontsize=20)\n",
"ax.set_xlabel('秒数', fontsize=20)\n",
"ax.set_xscale('log')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment