Skip to content

Instantly share code, notes, and snippets.

@kanjirz50
Created October 15, 2019 07:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kanjirz50/ce2cc8345d93b0d2d6f1500db6eda758 to your computer and use it in GitHub Desktop.
Save kanjirz50/ce2cc8345d93b0d2d6f1500db6eda758 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# AdaBoostの実装"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用ライブラリのインポート"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"ExecuteTime": {
"end_time": "2019-10-11T09:34:32.349088Z",
"start_time": "2019-10-11T09:34:32.083605Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"import math\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn import datasets\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## アルゴリズム評価用のシンプルな決定木"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"ExecuteTime": {
"end_time": "2019-10-11T08:43:58.741173Z",
"start_time": "2019-10-11T08:43:58.737446Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"class DecisionStump:\n",
"    \"\"\"Depth-1 decision tree (one split node, two leaves) used as the weak learner.\n",
"\n",
"    This is a plain parameter holder: every field except polarity starts as\n",
"    None and is filled in by the training code.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        # Direction of the split for the {-1, 1} classification.\n",
"        self.polarity = 1\n",
"        # Index of the feature this stump splits on.\n",
"        self.feature_index = None\n",
"        # Threshold applied to that feature.\n",
"        self.threshold = None\n",
"        # Weight of this classifier (its confidence / contribution).\n",
"        self.alpha = None"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## AdaBoostアルゴリズム"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"ExecuteTime": {
"end_time": "2019-10-11T09:25:17.824414Z",
"start_time": "2019-10-11T09:25:17.719349Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"class AdaBoost:\n",
"    \"\"\"AdaBoost binary classifier that boosts DecisionStump weak learners.\n",
"\n",
"    Labels are expected to be encoded as -1 / +1.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    n_clf : int, default 5\n",
"        Number of weak learners trained by fit().\n",
"\n",
"    Attributes\n",
"    ----------\n",
"    clfs : list of DecisionStump\n",
"        Fitted stumps in training order; empty until fit() has run.\n",
"    \"\"\"\n",
"\n",
"    # Added to both sides of the alpha ratio so that a weighted error of\n",
"    # exactly 0 or 1 gives a large finite alpha instead of a math error.\n",
"    _EPS = 1e-10\n",
"\n",
"    def __init__(self, n_clf=5):\n",
"        # Number of weak learners.\n",
"        self.n_clf = n_clf\n",
"        # Fitted weak learners.\n",
"        self.clfs = []\n",
"\n",
"    @staticmethod\n",
"    def _stump_votes(clf, X):\n",
"        \"\"\"Return the -1/+1 votes of one stump for every row of X.\n",
"\n",
"        A row votes -1 when polarity * x < polarity * threshold, that is\n",
"        x < threshold for polarity 1 and x > threshold for polarity -1.\n",
"        Every other row, ties included, votes +1.\n",
"        \"\"\"\n",
"        column = X[:, clf.feature_index]\n",
"        return np.where(clf.polarity * column < clf.polarity * clf.threshold, -1.0, 1.0)\n",
"\n",
"    def fit(self, X, y):\n",
"        \"\"\"Train n_clf decision stumps on successively re-weighted samples.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        X : array-like of shape (n_samples, n_features)\n",
"        y : array-like holding n_samples labels in {-1, 1}\n",
"\n",
"        Raises\n",
"        ------\n",
"        ValueError\n",
"            If X is not 2-dimensional or has no rows or no columns.\n",
"        \"\"\"\n",
"        X = np.asarray(X)\n",
"        y = np.asarray(y).ravel()\n",
"        n_samples, n_features = X.shape\n",
"        if n_samples == 0 or n_features == 0:\n",
"            raise ValueError(\"X must contain at least one sample and one feature\")\n",
"\n",
"        # Start from uniform sample weights.\n",
"        weights = np.full(n_samples, 1 / n_samples)\n",
"\n",
"        self.clfs = []\n",
"        for _ in range(self.n_clf):\n",
"            # The weak learner is a simple decision stump.\n",
"            clf = DecisionStump()\n",
"            # Use infinity as the initial error.\n",
"            min_error = float(\"inf\")\n",
"\n",
"            # Exhaustive search over every (feature, threshold, polarity).\n",
"            for feature_i in range(n_features):\n",
"                column = X[:, feature_i]\n",
"                # Candidate thresholds are the distinct values of the feature.\n",
"                for threshold in np.unique(column):\n",
"                    for polarity in (1, -1):\n",
"                        # Score the exact rule that gets stored. Taking\n",
"                        # 1 - error for polarity -1 would misjudge samples\n",
"                        # equal to the threshold, which the stored rule\n",
"                        # always votes +1.\n",
"                        votes = np.where(polarity * column < polarity * threshold, -1.0, 1.0)\n",
"                        # Weighted error: total weight of misclassified samples.\n",
"                        error = np.sum(weights[y != votes])\n",
"\n",
"                        # Keep the parameters with the smallest error.\n",
"                        if error < min_error:\n",
"                            clf.polarity = polarity\n",
"                            clf.threshold = threshold\n",
"                            clf.feature_index = feature_i\n",
"                            min_error = error\n",
"\n",
"            # Guard against rounding pushing the error outside [0, 1].\n",
"            min_error = min(max(float(min_error), 0.0), 1.0)\n",
"            # Confidence of this stump; negative when it is worse than chance,\n",
"            # which flips its vote in predict().\n",
"            clf.alpha = 0.5 * math.log((1.0 - min_error + self._EPS) / (min_error + self._EPS))\n",
"\n",
"            votes = self._stump_votes(clf, X)\n",
"            # Update the weights: misclassified samples get larger weights.\n",
"            weights *= np.exp(-clf.alpha * y * votes)\n",
"            # Normalise the weights so they sum to one.\n",
"            weights /= np.sum(weights)\n",
"\n",
"            self.clfs.append(clf)\n",
"\n",
"    def predict(self, X):\n",
"        \"\"\"Return the sign of the alpha-weighted sum of the stump votes.\n",
"\n",
"        Parameters\n",
"        ----------\n",
"        X : array-like of shape (n_samples, n_features)\n",
"\n",
"        Returns\n",
"        -------\n",
"        numpy.ndarray\n",
"            1-D float array with one entry per row of X. Entries are -1 or 1;\n",
"            0 appears only where the weighted votes cancel exactly, which\n",
"            includes the case where no stump has been fitted.\n",
"        \"\"\"\n",
"        X = np.asarray(X)\n",
"        n_samples, _ = X.shape\n",
"        y_pred = np.zeros(n_samples)\n",
"        for clf in self.clfs:\n",
"            y_pred += clf.alpha * self._stump_votes(clf, X)\n",
"\n",
"        # Map the scores to -1, 0, 1 (signum function).\n",
"        return np.sign(y_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2019-10-11T10:03:34.787471Z",
"start_time": "2019-10-11T10:03:34.785445Z"
}
},
"source": [
"## 乳がんデータセットで評価"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"ExecuteTime": {
"end_time": "2019-10-11T09:25:18.375776Z",
"start_time": "2019-10-11T09:25:18.363753Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"# Load scikit-learn's bundled breast cancer dataset.\n",
"data = datasets.load_breast_cancer()\n",
"X, y = data.data, data.target\n",
"\n",
"# Re-encode label 0 as -1 so the classes are {-1, 1}.\n",
"y[y == 0] = -1\n",
"\n",
"# Hold out 30% of the rows for testing; the seed keeps the split reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.3, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {
"ExecuteTime": {
"end_time": "2019-10-11T09:38:33.717179Z",
"start_time": "2019-10-11T09:38:30.914951Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.9590643274853801"
]
},
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit an ensemble of five stumps and report its accuracy on the test split.\n",
"ada_boost = AdaBoost(n_clf=5)\n",
"ada_boost.fit(X=X_train, y=y_train)\n",
"pred = ada_boost.predict(X=X_test)\n",
"accuracy_score(y_true=y_test, y_pred=pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1つめのシンプルな決定木での精度"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"ExecuteTime": {
"end_time": "2019-10-11T09:31:09.962080Z",
"start_time": "2019-10-11T09:31:09.954934Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"0.8947368421052632"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Score the first weak learner on its own, with the same rule AdaBoost.predict applies.\n",
"first_stump = ada_boost.clfs[0]\n",
"signed_feature = first_stump.polarity * X_test[:, first_stump.feature_index]\n",
"signed_threshold = first_stump.polarity * first_stump.threshold\n",
"\n",
"# -1 where the signed feature falls below the signed threshold, +1 elsewhere.\n",
"predictions = np.where(signed_feature < signed_threshold, -1.0, 1.0)\n",
"\n",
"accuracy_score(y_test, predictions)"
]
},
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2019-10-11T09:31:06.319374Z",
"start_time": "2019-10-11T09:31:06.315886Z"
}
},
"source": [
"## 弱学習器の数による精度の変化"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {
"ExecuteTime": {
"end_time": "2019-10-11T09:37:19.830872Z",
"start_time": "2019-10-11T09:36:49.852409Z"
},
"collapsed": true
},
"outputs": [],
"source": [
"# Refit the ensemble with 1 to 10 weak learners and record the test accuracy of each.\n",
"left = range(1, 11)\n",
"scores = []\n",
"for n_learners in left:\n",
"    ada_boost = AdaBoost(n_clf=n_learners)\n",
"    ada_boost.fit(X_train, y_train)\n",
"    pred = ada_boost.predict(X_test)\n",
"    scores.append(accuracy_score(y_test, pred))"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {
"ExecuteTime": {
"end_time": "2019-10-11T09:38:00.863809Z",
"start_time": "2019-10-11T09:38:00.751491Z"
}
},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Test accuracy as a function of the number of weak learners.\n",
"plt.plot(left, scores)\n",
"plt.title(\"Increasing Number of Classifier\")\n",
"plt.xlabel(\"Number of Classifier\")\n",
"plt.ylabel(\"Accuracy\")\n",
"plt.grid(True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"toc_cell": false,
"toc_position": {},
"toc_section_display": "block",
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment