Skip to content

Instantly share code, notes, and snippets.

@AfiKhan
Forked from yamasakih/11-Feature_select.ipynb
Created September 14, 2021 19:32
Show Gist options
  • Save AfiKhan/867d47ed78998ba6188ffd01381090db to your computer and use it in GitHub Desktop.
Save AfiKhan/867d47ed78998ba6188ffd01381090db to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from copy import deepcopy\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"from rdkit.Chem import AllChem as Chem, DataStructs\n",
"from rfpimp import importances\n",
"import scipy.sparse as sp\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.feature_selection import f_regression, mutual_info_regression, SelectPercentile, VarianceThreshold\n",
"from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def plot_experimented_and_predicted_value(training_y, predicted_training_y,\n",
" test_y, predicted_test_y, title):\n",
" \n",
" def _plot_experimented_and_predicted_value(experimented_value, predicted_value, title):\n",
" lim = (min(experimented_value.min(), predicted_value.min()),\n",
" max(experimented_value.max(), predicted_value.max()))\n",
" plt.scatter(predicted_value, experimented_value, s=3)\n",
" plt.plot(lim, lim, c='red', alpha=0.5)\n",
" plt.xlim(lim)\n",
" plt.ylim(lim)\n",
" plt.xlabel('Predicted value', fontsize=16)\n",
" plt.ylabel('Experimented value', fontsize=16)\n",
" plt.title(title, fontsize=16)\n",
" \n",
" plt.figure(figsize=(11, 5))\n",
" plt.subplot(1, 2, 1)\n",
" _plot_experimented_and_predicted_value(training_y, predicted_training_y, f'{title}: Training set')\n",
" plt.subplot(1, 2, 2)\n",
" _plot_experimented_and_predicted_value(test_y, predicted_test_y, f'{title}: Test set')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"input = 'logSdataset1290_2d.sdf'\n",
"random_state = 20150917"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"supp = Chem.SDMolSupplier(input)\n",
"mols = [mol for mol in supp if mol]\n",
"y = np.array([float(mol.GetProp('logS')) for mol in mols])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1290"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(mols)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"n_bits = 2**13 #8192"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"fps = [Chem.GetMorganFingerprintAsBitVect(mol=mol, radius=2, nBits=n_bits, useFeatures=False) for mol in mols]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"X = np.empty((len(mols), n_bits))\n",
"for i, fp in enumerate(fps):\n",
" tmp = np.array([], )\n",
" DataStructs.ConvertToNumpyArray(fp, tmp)\n",
" X[i, :] = tmp"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1290, 8192)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"X = pd.DataFrame(X, columns=[f'Bit{i}' for i in np.arange(X.shape[1])])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"training_X, test_X, training_y, test_y = train_test_split(\n",
" X, y, test_size=0.2, random_state=random_state, shuffle=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"pipeline = make_pipeline(\n",
" VarianceThreshold(),\n",
" SelectPercentile(),\n",
" StandardScaler(with_mean=False, with_std=True),\n",
" RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=random_state)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"param_grid = {\n",
" 'variancethreshold__threshold': np.arange(0.0, 0.1, 0.02),\n",
" 'selectpercentile__percentile': np.append(np.arange(5, 101, 10), 100),\n",
" 'selectpercentile__score_func' : [f_regression, mutual_info_regression],\n",
" 'randomforestregressor__n_estimators': np.arange(300, 1501, 300),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 3 folds for each of 550 candidates, totalling 1650 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n",
"[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 10.1s\n",
"[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 2.5min\n",
"/Users/yamasakih/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/externals/loky/process_executor.py:700: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.\n",
" \"timeout or by a memory leak.\", UserWarning\n",
"[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 7.4min\n",
"[Parallel(n_jobs=-1)]: Done 632 tasks | elapsed: 16.4min\n",
"[Parallel(n_jobs=-1)]: Done 997 tasks | elapsed: 30.8min\n",
"[Parallel(n_jobs=-1)]: Done 1442 tasks | elapsed: 51.7min\n",
"[Parallel(n_jobs=-1)]: Done 1650 out of 1650 | elapsed: 67.4min finished\n"
]
},
{
"data": {
"text/plain": [
"GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=0, test_size=0.3, train_size=None),\n",
" error_score='raise-deprecating',\n",
" estimator=Pipeline(memory=None,\n",
" steps=[('variancethreshold', VarianceThreshold(threshold=0.0)), ('selectpercentile', SelectPercentile(percentile=10,\n",
" score_func=<function f_classif at 0x1a195f9048>)), ('standardscaler', StandardScaler(copy=True, with_mean=False, with_std=True)), ('randomforestregressor', RandomForestRegres...obs=-1,\n",
" oob_score=False, random_state=20150917, verbose=0,\n",
" warm_start=False))]),\n",
" fit_params=None, iid='warn', n_jobs=-1,\n",
" param_grid={'variancethreshold__threshold': array([0. , 0.02, 0.04, 0.06, 0.08]), 'selectpercentile__percentile': array([ 5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 100]), 'selectpercentile__score_func': [<function f_regression at 0x1a195f91e0>, <function mutual_info_regression at 0x1a1960e158>], 'randomforestregressor__n_estimators': array([ 300, 600, 900, 1200, 1500])},\n",
" pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
" scoring='neg_mean_absolute_error', verbose=2)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shuffle_split = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)\n",
"gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=shuffle_split, return_train_score=True,\n",
" scoring='neg_mean_absolute_error', n_jobs=-1, verbose=2)\n",
"gs.fit(training_X, training_y)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(memory=None,\n",
" steps=[('variancethreshold', VarianceThreshold(threshold=0.0)), ('selectpercentile', SelectPercentile(percentile=95,\n",
" score_func=<function f_regression at 0x1a195f91e0>)), ('standardscaler', StandardScaler(copy=True, with_mean=False, with_std=True)), ('randomforestregressor', RandomForestReg...obs=-1,\n",
" oob_score=False, random_state=20150917, verbose=0,\n",
" warm_start=False))])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs.best_estimator_"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>mean_fit_time</th>\n",
" <th>std_fit_time</th>\n",
" <th>mean_score_time</th>\n",
" <th>std_score_time</th>\n",
" <th>param_randomforestregressor__n_estimators</th>\n",
" <th>param_selectpercentile__percentile</th>\n",
" <th>param_selectpercentile__score_func</th>\n",
" <th>param_variancethreshold__threshold</th>\n",
" <th>params</th>\n",
" <th>split0_test_score</th>\n",
" <th>split1_test_score</th>\n",
" <th>split2_test_score</th>\n",
" <th>mean_test_score</th>\n",
" <th>std_test_score</th>\n",
" <th>rank_test_score</th>\n",
" <th>split0_train_score</th>\n",
" <th>split1_train_score</th>\n",
" <th>split2_train_score</th>\n",
" <th>mean_train_score</th>\n",
" <th>std_train_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>2.340794</td>\n",
" <td>0.300037</td>\n",
" <td>0.130464</td>\n",
" <td>0.005036</td>\n",
" <td>300</td>\n",
" <td>15</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 300, '...</td>\n",
" <td>-0.943628</td>\n",
" <td>-0.871986</td>\n",
" <td>-0.842301</td>\n",
" <td>-0.885972</td>\n",
" <td>0.042532</td>\n",
" <td>19</td>\n",
" <td>-0.370619</td>\n",
" <td>-0.375834</td>\n",
" <td>-0.382988</td>\n",
" <td>-0.376480</td>\n",
" <td>0.005070</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>10.528222</td>\n",
" <td>0.940662</td>\n",
" <td>0.151243</td>\n",
" <td>0.001575</td>\n",
" <td>300</td>\n",
" <td>95</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 300, '...</td>\n",
" <td>-0.938614</td>\n",
" <td>-0.847938</td>\n",
" <td>-0.863170</td>\n",
" <td>-0.883240</td>\n",
" <td>0.039645</td>\n",
" <td>5</td>\n",
" <td>-0.344665</td>\n",
" <td>-0.346377</td>\n",
" <td>-0.351231</td>\n",
" <td>-0.347424</td>\n",
" <td>0.002781</td>\n",
" </tr>\n",
" <tr>\n",
" <th>120</th>\n",
" <td>3.105771</td>\n",
" <td>0.218041</td>\n",
" <td>0.234040</td>\n",
" <td>0.007595</td>\n",
" <td>600</td>\n",
" <td>15</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 600, '...</td>\n",
" <td>-0.943684</td>\n",
" <td>-0.871894</td>\n",
" <td>-0.841502</td>\n",
" <td>-0.885694</td>\n",
" <td>0.042841</td>\n",
" <td>14</td>\n",
" <td>-0.368714</td>\n",
" <td>-0.375645</td>\n",
" <td>-0.382265</td>\n",
" <td>-0.375541</td>\n",
" <td>0.005532</td>\n",
" </tr>\n",
" <tr>\n",
" <th>170</th>\n",
" <td>13.607919</td>\n",
" <td>1.549001</td>\n",
" <td>0.242282</td>\n",
" <td>0.004673</td>\n",
" <td>600</td>\n",
" <td>65</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 600, '...</td>\n",
" <td>-0.937537</td>\n",
" <td>-0.859764</td>\n",
" <td>-0.858828</td>\n",
" <td>-0.885376</td>\n",
" <td>0.036885</td>\n",
" <td>9</td>\n",
" <td>-0.341022</td>\n",
" <td>-0.347568</td>\n",
" <td>-0.349905</td>\n",
" <td>-0.346165</td>\n",
" <td>0.003760</td>\n",
" </tr>\n",
" <tr>\n",
" <th>180</th>\n",
" <td>18.404103</td>\n",
" <td>4.982111</td>\n",
" <td>0.237588</td>\n",
" <td>0.006530</td>\n",
" <td>600</td>\n",
" <td>75</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 600, '...</td>\n",
" <td>-0.942826</td>\n",
" <td>-0.857595</td>\n",
" <td>-0.856221</td>\n",
" <td>-0.885547</td>\n",
" <td>0.040506</td>\n",
" <td>11</td>\n",
" <td>-0.342478</td>\n",
" <td>-0.347106</td>\n",
" <td>-0.349115</td>\n",
" <td>-0.346233</td>\n",
" <td>0.002779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>190</th>\n",
" <td>21.924798</td>\n",
" <td>6.057214</td>\n",
" <td>0.249807</td>\n",
" <td>0.009105</td>\n",
" <td>600</td>\n",
" <td>85</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 600, '...</td>\n",
" <td>-0.939885</td>\n",
" <td>-0.856279</td>\n",
" <td>-0.860871</td>\n",
" <td>-0.885678</td>\n",
" <td>0.038376</td>\n",
" <td>13</td>\n",
" <td>-0.340665</td>\n",
" <td>-0.346452</td>\n",
" <td>-0.348677</td>\n",
" <td>-0.345265</td>\n",
" <td>0.003377</td>\n",
" </tr>\n",
" <tr>\n",
" <th>200</th>\n",
" <td>26.032370</td>\n",
" <td>5.246677</td>\n",
" <td>0.252347</td>\n",
" <td>0.007084</td>\n",
" <td>600</td>\n",
" <td>95</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 600, '...</td>\n",
" <td>-0.937249</td>\n",
" <td>-0.850122</td>\n",
" <td>-0.858931</td>\n",
" <td>-0.882101</td>\n",
" <td>0.039161</td>\n",
" <td>2</td>\n",
" <td>-0.341842</td>\n",
" <td>-0.346288</td>\n",
" <td>-0.349317</td>\n",
" <td>-0.345815</td>\n",
" <td>0.003070</td>\n",
" </tr>\n",
" <tr>\n",
" <th>205</th>\n",
" <td>135.622791</td>\n",
" <td>1.864940</td>\n",
" <td>0.249126</td>\n",
" <td>0.011696</td>\n",
" <td>600</td>\n",
" <td>95</td>\n",
" <td>&lt;function mutual_info_regression at 0x1a1960e158&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 600, '...</td>\n",
" <td>-0.947186</td>\n",
" <td>-0.851216</td>\n",
" <td>-0.859047</td>\n",
" <td>-0.885817</td>\n",
" <td>0.043512</td>\n",
" <td>16</td>\n",
" <td>-0.341680</td>\n",
" <td>-0.345807</td>\n",
" <td>-0.350339</td>\n",
" <td>-0.345942</td>\n",
" <td>0.003536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>230</th>\n",
" <td>4.605038</td>\n",
" <td>0.278365</td>\n",
" <td>0.238162</td>\n",
" <td>0.004040</td>\n",
" <td>900</td>\n",
" <td>15</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 900, '...</td>\n",
" <td>-0.946329</td>\n",
" <td>-0.870596</td>\n",
" <td>-0.838679</td>\n",
" <td>-0.885201</td>\n",
" <td>0.045145</td>\n",
" <td>8</td>\n",
" <td>-0.369157</td>\n",
" <td>-0.376066</td>\n",
" <td>-0.382569</td>\n",
" <td>-0.375931</td>\n",
" <td>0.005476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>310</th>\n",
" <td>50.707395</td>\n",
" <td>8.854649</td>\n",
" <td>0.245704</td>\n",
" <td>0.006478</td>\n",
" <td>900</td>\n",
" <td>95</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 900, '...</td>\n",
" <td>-0.939194</td>\n",
" <td>-0.852424</td>\n",
" <td>-0.857810</td>\n",
" <td>-0.883143</td>\n",
" <td>0.039695</td>\n",
" <td>4</td>\n",
" <td>-0.342567</td>\n",
" <td>-0.347138</td>\n",
" <td>-0.349058</td>\n",
" <td>-0.346254</td>\n",
" <td>0.002723</td>\n",
" </tr>\n",
" <tr>\n",
" <th>340</th>\n",
" <td>7.340413</td>\n",
" <td>1.605647</td>\n",
" <td>0.347139</td>\n",
" <td>0.009792</td>\n",
" <td>1200</td>\n",
" <td>15</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 1200, ...</td>\n",
" <td>-0.944195</td>\n",
" <td>-0.871864</td>\n",
" <td>-0.838514</td>\n",
" <td>-0.884858</td>\n",
" <td>0.044112</td>\n",
" <td>7</td>\n",
" <td>-0.368879</td>\n",
" <td>-0.377187</td>\n",
" <td>-0.382811</td>\n",
" <td>-0.376293</td>\n",
" <td>0.005723</td>\n",
" </tr>\n",
" <tr>\n",
" <th>410</th>\n",
" <td>59.967925</td>\n",
" <td>3.010375</td>\n",
" <td>0.346323</td>\n",
" <td>0.006296</td>\n",
" <td>1200</td>\n",
" <td>85</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 1200, ...</td>\n",
" <td>-0.938870</td>\n",
" <td>-0.858421</td>\n",
" <td>-0.860480</td>\n",
" <td>-0.885924</td>\n",
" <td>0.037448</td>\n",
" <td>18</td>\n",
" <td>-0.341874</td>\n",
" <td>-0.347112</td>\n",
" <td>-0.347865</td>\n",
" <td>-0.345617</td>\n",
" <td>0.002665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>420</th>\n",
" <td>67.232915</td>\n",
" <td>5.321867</td>\n",
" <td>0.356481</td>\n",
" <td>0.005594</td>\n",
" <td>1200</td>\n",
" <td>95</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 1200, ...</td>\n",
" <td>-0.937423</td>\n",
" <td>-0.852502</td>\n",
" <td>-0.856930</td>\n",
" <td>-0.882285</td>\n",
" <td>0.039031</td>\n",
" <td>3</td>\n",
" <td>-0.342715</td>\n",
" <td>-0.347692</td>\n",
" <td>-0.348595</td>\n",
" <td>-0.346334</td>\n",
" <td>0.002586</td>\n",
" </tr>\n",
" <tr>\n",
" <th>450</th>\n",
" <td>16.274444</td>\n",
" <td>0.806114</td>\n",
" <td>0.346147</td>\n",
" <td>0.008797</td>\n",
" <td>1500</td>\n",
" <td>15</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 1500, ...</td>\n",
" <td>-0.945321</td>\n",
" <td>-0.870367</td>\n",
" <td>-0.837342</td>\n",
" <td>-0.884343</td>\n",
" <td>0.045176</td>\n",
" <td>6</td>\n",
" <td>-0.368719</td>\n",
" <td>-0.377324</td>\n",
" <td>-0.382715</td>\n",
" <td>-0.376253</td>\n",
" <td>0.005764</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500</th>\n",
" <td>54.720098</td>\n",
" <td>0.352229</td>\n",
" <td>0.413356</td>\n",
" <td>0.052451</td>\n",
" <td>1500</td>\n",
" <td>65</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 1500, ...</td>\n",
" <td>-0.942593</td>\n",
" <td>-0.858608</td>\n",
" <td>-0.855953</td>\n",
" <td>-0.885718</td>\n",
" <td>0.040231</td>\n",
" <td>15</td>\n",
" <td>-0.341795</td>\n",
" <td>-0.348258</td>\n",
" <td>-0.349553</td>\n",
" <td>-0.346535</td>\n",
" <td>0.003393</td>\n",
" </tr>\n",
" <tr>\n",
" <th>510</th>\n",
" <td>64.882490</td>\n",
" <td>3.607211</td>\n",
" <td>0.380268</td>\n",
" <td>0.052067</td>\n",
" <td>1500</td>\n",
" <td>75</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 1500, ...</td>\n",
" <td>-0.944586</td>\n",
" <td>-0.856216</td>\n",
" <td>-0.855952</td>\n",
" <td>-0.885585</td>\n",
" <td>0.041720</td>\n",
" <td>12</td>\n",
" <td>-0.342776</td>\n",
" <td>-0.348251</td>\n",
" <td>-0.348567</td>\n",
" <td>-0.346531</td>\n",
" <td>0.002659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>520</th>\n",
" <td>73.660243</td>\n",
" <td>2.588436</td>\n",
" <td>0.453762</td>\n",
" <td>0.004975</td>\n",
" <td>1500</td>\n",
" <td>85</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 1500, ...</td>\n",
" <td>-0.938843</td>\n",
" <td>-0.857540</td>\n",
" <td>-0.860013</td>\n",
" <td>-0.885465</td>\n",
" <td>0.037757</td>\n",
" <td>10</td>\n",
" <td>-0.341996</td>\n",
" <td>-0.347487</td>\n",
" <td>-0.348140</td>\n",
" <td>-0.345874</td>\n",
" <td>0.002755</td>\n",
" </tr>\n",
" <tr>\n",
" <th>530</th>\n",
" <td>80.701550</td>\n",
" <td>4.365979</td>\n",
" <td>0.455875</td>\n",
" <td>0.003979</td>\n",
" <td>1500</td>\n",
" <td>95</td>\n",
" <td>&lt;function f_regression at 0x1a195f91e0&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 1500, ...</td>\n",
" <td>-0.938691</td>\n",
" <td>-0.850485</td>\n",
" <td>-0.856768</td>\n",
" <td>-0.881982</td>\n",
" <td>0.040182</td>\n",
" <td>1</td>\n",
" <td>-0.342474</td>\n",
" <td>-0.347482</td>\n",
" <td>-0.348567</td>\n",
" <td>-0.346174</td>\n",
" <td>0.002654</td>\n",
" </tr>\n",
" <tr>\n",
" <th>535</th>\n",
" <td>255.620373</td>\n",
" <td>0.894018</td>\n",
" <td>0.381099</td>\n",
" <td>0.045727</td>\n",
" <td>1500</td>\n",
" <td>95</td>\n",
" <td>&lt;function mutual_info_regression at 0x1a1960e158&gt;</td>\n",
" <td>0</td>\n",
" <td>{'randomforestregressor__n_estimators': 1500, ...</td>\n",
" <td>-0.949073</td>\n",
" <td>-0.850371</td>\n",
" <td>-0.858187</td>\n",
" <td>-0.885877</td>\n",
" <td>0.044800</td>\n",
" <td>17</td>\n",
" <td>-0.341987</td>\n",
" <td>-0.346581</td>\n",
" <td>-0.348844</td>\n",
" <td>-0.345804</td>\n",
" <td>0.002853</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mean_fit_time std_fit_time mean_score_time std_score_time \\\n",
"10 2.340794 0.300037 0.130464 0.005036 \n",
"90 10.528222 0.940662 0.151243 0.001575 \n",
"120 3.105771 0.218041 0.234040 0.007595 \n",
"170 13.607919 1.549001 0.242282 0.004673 \n",
"180 18.404103 4.982111 0.237588 0.006530 \n",
"190 21.924798 6.057214 0.249807 0.009105 \n",
"200 26.032370 5.246677 0.252347 0.007084 \n",
"205 135.622791 1.864940 0.249126 0.011696 \n",
"230 4.605038 0.278365 0.238162 0.004040 \n",
"310 50.707395 8.854649 0.245704 0.006478 \n",
"340 7.340413 1.605647 0.347139 0.009792 \n",
"410 59.967925 3.010375 0.346323 0.006296 \n",
"420 67.232915 5.321867 0.356481 0.005594 \n",
"450 16.274444 0.806114 0.346147 0.008797 \n",
"500 54.720098 0.352229 0.413356 0.052451 \n",
"510 64.882490 3.607211 0.380268 0.052067 \n",
"520 73.660243 2.588436 0.453762 0.004975 \n",
"530 80.701550 4.365979 0.455875 0.003979 \n",
"535 255.620373 0.894018 0.381099 0.045727 \n",
"\n",
" param_randomforestregressor__n_estimators \\\n",
"10 300 \n",
"90 300 \n",
"120 600 \n",
"170 600 \n",
"180 600 \n",
"190 600 \n",
"200 600 \n",
"205 600 \n",
"230 900 \n",
"310 900 \n",
"340 1200 \n",
"410 1200 \n",
"420 1200 \n",
"450 1500 \n",
"500 1500 \n",
"510 1500 \n",
"520 1500 \n",
"530 1500 \n",
"535 1500 \n",
"\n",
" param_selectpercentile__percentile \\\n",
"10 15 \n",
"90 95 \n",
"120 15 \n",
"170 65 \n",
"180 75 \n",
"190 85 \n",
"200 95 \n",
"205 95 \n",
"230 15 \n",
"310 95 \n",
"340 15 \n",
"410 85 \n",
"420 95 \n",
"450 15 \n",
"500 65 \n",
"510 75 \n",
"520 85 \n",
"530 95 \n",
"535 95 \n",
"\n",
" param_selectpercentile__score_func \\\n",
"10 <function f_regression at 0x1a195f91e0> \n",
"90 <function f_regression at 0x1a195f91e0> \n",
"120 <function f_regression at 0x1a195f91e0> \n",
"170 <function f_regression at 0x1a195f91e0> \n",
"180 <function f_regression at 0x1a195f91e0> \n",
"190 <function f_regression at 0x1a195f91e0> \n",
"200 <function f_regression at 0x1a195f91e0> \n",
"205 <function mutual_info_regression at 0x1a1960e158> \n",
"230 <function f_regression at 0x1a195f91e0> \n",
"310 <function f_regression at 0x1a195f91e0> \n",
"340 <function f_regression at 0x1a195f91e0> \n",
"410 <function f_regression at 0x1a195f91e0> \n",
"420 <function f_regression at 0x1a195f91e0> \n",
"450 <function f_regression at 0x1a195f91e0> \n",
"500 <function f_regression at 0x1a195f91e0> \n",
"510 <function f_regression at 0x1a195f91e0> \n",
"520 <function f_regression at 0x1a195f91e0> \n",
"530 <function f_regression at 0x1a195f91e0> \n",
"535 <function mutual_info_regression at 0x1a1960e158> \n",
"\n",
" param_variancethreshold__threshold \\\n",
"10 0 \n",
"90 0 \n",
"120 0 \n",
"170 0 \n",
"180 0 \n",
"190 0 \n",
"200 0 \n",
"205 0 \n",
"230 0 \n",
"310 0 \n",
"340 0 \n",
"410 0 \n",
"420 0 \n",
"450 0 \n",
"500 0 \n",
"510 0 \n",
"520 0 \n",
"530 0 \n",
"535 0 \n",
"\n",
" params split0_test_score \\\n",
"10 {'randomforestregressor__n_estimators': 300, '... -0.943628 \n",
"90 {'randomforestregressor__n_estimators': 300, '... -0.938614 \n",
"120 {'randomforestregressor__n_estimators': 600, '... -0.943684 \n",
"170 {'randomforestregressor__n_estimators': 600, '... -0.937537 \n",
"180 {'randomforestregressor__n_estimators': 600, '... -0.942826 \n",
"190 {'randomforestregressor__n_estimators': 600, '... -0.939885 \n",
"200 {'randomforestregressor__n_estimators': 600, '... -0.937249 \n",
"205 {'randomforestregressor__n_estimators': 600, '... -0.947186 \n",
"230 {'randomforestregressor__n_estimators': 900, '... -0.946329 \n",
"310 {'randomforestregressor__n_estimators': 900, '... -0.939194 \n",
"340 {'randomforestregressor__n_estimators': 1200, ... -0.944195 \n",
"410 {'randomforestregressor__n_estimators': 1200, ... -0.938870 \n",
"420 {'randomforestregressor__n_estimators': 1200, ... -0.937423 \n",
"450 {'randomforestregressor__n_estimators': 1500, ... -0.945321 \n",
"500 {'randomforestregressor__n_estimators': 1500, ... -0.942593 \n",
"510 {'randomforestregressor__n_estimators': 1500, ... -0.944586 \n",
"520 {'randomforestregressor__n_estimators': 1500, ... -0.938843 \n",
"530 {'randomforestregressor__n_estimators': 1500, ... -0.938691 \n",
"535 {'randomforestregressor__n_estimators': 1500, ... -0.949073 \n",
"\n",
" split1_test_score split2_test_score mean_test_score std_test_score \\\n",
"10 -0.871986 -0.842301 -0.885972 0.042532 \n",
"90 -0.847938 -0.863170 -0.883240 0.039645 \n",
"120 -0.871894 -0.841502 -0.885694 0.042841 \n",
"170 -0.859764 -0.858828 -0.885376 0.036885 \n",
"180 -0.857595 -0.856221 -0.885547 0.040506 \n",
"190 -0.856279 -0.860871 -0.885678 0.038376 \n",
"200 -0.850122 -0.858931 -0.882101 0.039161 \n",
"205 -0.851216 -0.859047 -0.885817 0.043512 \n",
"230 -0.870596 -0.838679 -0.885201 0.045145 \n",
"310 -0.852424 -0.857810 -0.883143 0.039695 \n",
"340 -0.871864 -0.838514 -0.884858 0.044112 \n",
"410 -0.858421 -0.860480 -0.885924 0.037448 \n",
"420 -0.852502 -0.856930 -0.882285 0.039031 \n",
"450 -0.870367 -0.837342 -0.884343 0.045176 \n",
"500 -0.858608 -0.855953 -0.885718 0.040231 \n",
"510 -0.856216 -0.855952 -0.885585 0.041720 \n",
"520 -0.857540 -0.860013 -0.885465 0.037757 \n",
"530 -0.850485 -0.856768 -0.881982 0.040182 \n",
"535 -0.850371 -0.858187 -0.885877 0.044800 \n",
"\n",
" rank_test_score split0_train_score split1_train_score \\\n",
"10 19 -0.370619 -0.375834 \n",
"90 5 -0.344665 -0.346377 \n",
"120 14 -0.368714 -0.375645 \n",
"170 9 -0.341022 -0.347568 \n",
"180 11 -0.342478 -0.347106 \n",
"190 13 -0.340665 -0.346452 \n",
"200 2 -0.341842 -0.346288 \n",
"205 16 -0.341680 -0.345807 \n",
"230 8 -0.369157 -0.376066 \n",
"310 4 -0.342567 -0.347138 \n",
"340 7 -0.368879 -0.377187 \n",
"410 18 -0.341874 -0.347112 \n",
"420 3 -0.342715 -0.347692 \n",
"450 6 -0.368719 -0.377324 \n",
"500 15 -0.341795 -0.348258 \n",
"510 12 -0.342776 -0.348251 \n",
"520 10 -0.341996 -0.347487 \n",
"530 1 -0.342474 -0.347482 \n",
"535 17 -0.341987 -0.346581 \n",
"\n",
" split2_train_score mean_train_score std_train_score \n",
"10 -0.382988 -0.376480 0.005070 \n",
"90 -0.351231 -0.347424 0.002781 \n",
"120 -0.382265 -0.375541 0.005532 \n",
"170 -0.349905 -0.346165 0.003760 \n",
"180 -0.349115 -0.346233 0.002779 \n",
"190 -0.348677 -0.345265 0.003377 \n",
"200 -0.349317 -0.345815 0.003070 \n",
"205 -0.350339 -0.345942 0.003536 \n",
"230 -0.382569 -0.375931 0.005476 \n",
"310 -0.349058 -0.346254 0.002723 \n",
"340 -0.382811 -0.376293 0.005723 \n",
"410 -0.347865 -0.345617 0.002665 \n",
"420 -0.348595 -0.346334 0.002586 \n",
"450 -0.382715 -0.376253 0.005764 \n",
"500 -0.349553 -0.346535 0.003393 \n",
"510 -0.348567 -0.346531 0.002659 \n",
"520 -0.348140 -0.345874 0.002755 \n",
"530 -0.348567 -0.346174 0.002654 \n",
"535 -0.348844 -0.345804 0.002853 "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cv_results = pd.DataFrame(gs.cv_results_)\n",
"cv_results[cv_results.rank_test_score<20]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"estimator = gs.best_estimator_"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training:\t0.95\n",
"test:\t0.73\n"
]
}
],
"source": [
"print(f'Training:\\t{estimator.score(training_X, training_y):.2}')\n",
"print(f'test:\\t{estimator.score(test_X, test_y):.2}')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 792x360 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plot_experimented_and_predicted_value(training_y, estimator.predict(training_X),\n",
" test_y, estimator.predict(test_X), title='All features')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"imp = importances(estimator, test_X, test_y, metric='neg_mean_absolute_error', n_samples=-1)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Importance</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Feature</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Bit561</th>\n",
" <td>0.432699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Bit4903</th>\n",
" <td>0.098274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Bit5476</th>\n",
" <td>0.096905</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Bit6794</th>\n",
" <td>0.061612</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Bit2400</th>\n",
" <td>0.044542</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Importance\n",
"Feature \n",
"Bit561 0.432699\n",
"Bit4903 0.098274\n",
"Bit5476 0.096905\n",
"Bit6794 0.061612\n",
"Bit2400 0.044542"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"imp.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"important_training_X = training_X.reindex(columns=imp[imp.Importance>0].index)\n",
"important_test_X = test_X.reindex(columns=imp[imp.Importance>0].index)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"copyed_estimator = deepcopy(estimator)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(memory=None,\n",
" steps=[('variancethreshold', VarianceThreshold(threshold=0.0)), ('selectpercentile', SelectPercentile(percentile=95,\n",
" score_func=<function f_regression at 0x1a195f91e0>)), ('standardscaler', StandardScaler(copy=True, with_mean=False, with_std=True)), ('randomforestregressor', RandomForestReg...obs=-1,\n",
" oob_score=False, random_state=20150917, verbose=0,\n",
" warm_start=False))])"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"copyed_estimator.fit(sp.csr_matrix(important_training_X), training_y)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training:\t0.95\n",
"test:\t0.77\n"
]
}
],
"source": [
"print(f'Training:\\t{copyed_estimator.score(important_training_X, training_y):.2}')\n",
"print(f'test:\\t{copyed_estimator.score(important_test_X, test_y):.2}')"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1032, 640)"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"important_training_X.shape"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 792x360 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plot_experimented_and_predicted_value(training_y, copyed_estimator.predict(important_training_X),\n",
" test_y, copyed_estimator.predict(important_test_X), title='Important features')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# E0F "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment