Skip to content

Instantly share code, notes, and snippets.

@alexpearce
Last active December 19, 2016 16:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexpearce/2587965cddbe32f6dad2f3497efed470 to your computer and use it in GitHub Desktop.
Save alexpearce/2587965cddbe32f6dad2f3497efed470 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn import model_selection"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"parameter_grid = model_selection.ParameterGrid(dict(max_depth=[2, 3, 4], foo=['bar', 'baz']))\n",
"kfold = model_selection.StratifiedKFold(n_splits=5, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" <th>label</th>\n",
" <th>w</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>10</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>11</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>12</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>13</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>14</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>15</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>6</td>\n",
" <td>16</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>7</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>8</td>\n",
" <td>18</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>9</td>\n",
" <td>19</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" a b label w\n",
"0 0 10 0 1\n",
"1 1 11 0 0\n",
"2 2 12 0 1\n",
"3 3 13 0 0\n",
"4 4 14 0 0\n",
"5 5 15 1 0\n",
"6 6 16 1 0\n",
"7 7 17 1 0\n",
"8 8 18 1 0\n",
"9 9 19 1 0"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(dict(a=range(10), b=range(10, 20),\n",
" # Weights\n",
" w=np.random.choice([0, 1], size=10),\n",
" # Binary classification labels\n",
" label=np.array([0]*5 + [1]*5)))\n",
"X = df[['a', 'b']]\n",
"y = df.label\n",
"W = df.w\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'foo': 'bar', 'max_depth': 2}\n",
"{'foo': 'bar', 'max_depth': 3}\n",
"{'foo': 'bar', 'max_depth': 4}\n",
"{'foo': 'baz', 'max_depth': 2}\n",
"{'foo': 'baz', 'max_depth': 3}\n",
"{'foo': 'baz', 'max_depth': 4}\n",
"----------------------------------------\n",
"(array([2, 3, 4, 5, 6, 7, 8, 9]), array([0, 1]))\n",
"(array([0, 1, 4, 5, 6, 7, 8, 9]), array([2, 3]))\n",
"(array([0, 1, 2, 3, 6, 7, 8, 9]), array([4, 5]))\n",
"(array([0, 1, 2, 3, 4, 5, 8, 9]), array([6, 7]))\n",
"(array([0, 1, 2, 3, 4, 5, 6, 7]), array([8, 9]))\n"
]
}
],
"source": [
"# Let's see what the grid looks like...\n",
"for parameters in parameter_grid:\n",
" print(parameters)\n",
"\n",
"print('-'*40)\n",
"\n",
"# ...and the k-folding\n",
"for train_index, test_index in kfold.split(X, y):\n",
" print(train_index, test_index)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'mean_score': [0.53625350385827031,\n",
" 0.38365189527912463,\n",
" 0.55310431548829675,\n",
" 0.44358784922652716,\n",
" 0.54948020780469675,\n",
" 0.68983372669340581],\n",
" 'parameters': [{'foo': 'bar', 'max_depth': 2},\n",
" {'foo': 'bar', 'max_depth': 3},\n",
" {'foo': 'bar', 'max_depth': 4},\n",
" {'foo': 'baz', 'max_depth': 2},\n",
" {'foo': 'baz', 'max_depth': 3},\n",
" {'foo': 'baz', 'max_depth': 4}],\n",
" 'score': [array([ 0.32875334, 0.84342939, 0.18596959, 0.76166583, 0.56144937]),\n",
" array([ 0.04631226, 0.8176691 , 0.03392496, 0.82079794, 0.19955522]),\n",
" array([ 0.6222717 , 0.63061044, 0.30095591, 0.31146542, 0.90021811]),\n",
" array([ 0.18428163, 0.23682483, 0.7379395 , 0.33171623, 0.72717707]),\n",
" array([ 0.64512578, 0.485501 , 0.08040944, 0.89287692, 0.6434879 ]),\n",
" array([ 0.72349646, 0.77137286, 0.74873705, 0.69683843, 0.50872385])],\n",
" 'std_score': [0.24962280085738531,\n",
" 0.36040703124664514,\n",
" 0.22505443364865774,\n",
" 0.24065396682144483,\n",
" 0.26833842592133511,\n",
" 0.093916202727686665]}\n",
"Best score: 0.69 +/- 0.25\n",
"Best parameter set: {'foo': 'bar', 'max_depth': 2}\n"
]
}
],
"source": [
"results = {\n",
" 'parameters': [],\n",
" 'score': [],\n",
" 'mean_score': [],\n",
" 'std_score': []\n",
"}\n",
"\n",
"# A poor-man's GridSearchCV.\n",
"# For every parameter combination:\n",
"# For every k-fold:\n",
"# 1. Fit a classifier on the training data\n",
"# 2. Test the fitted classifier on the testing data\n",
"# 3. Record the metric using the classifier response on the test data\n",
"# 1. Compute averages and variances using the ensemble of scores\n",
"for parameters in parameter_grid:\n",
" results['parameters'].append(parameters)\n",
" scores = []\n",
"\n",
" for train_index, test_index in kfold.split(X, y):\n",
" Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]\n",
" ytrain, ytest = y.iloc[train_index], y.iloc[test_index]\n",
" Wtrain = W.iloc[train_index]\n",
" Wtest = W.iloc[test_index]\n",
" \n",
" # TODO: probably want to store the clf objects,\n",
" # so we can retrieve the best one later\n",
" # clf = ...\n",
" # clf.fit(...)\n",
" # score = compute_score(clf, Xtest, ytest)\n",
" score = np.random.uniform()\n",
" scores.append(score)\n",
" \n",
" scores = np.array(scores)\n",
" mean = scores.mean()\n",
" std = scores.std()\n",
" \n",
" results['score'].append(scores)\n",
" results['mean_score'].append(mean)\n",
" results['std_score'].append(std)\n",
"\n",
"best_score = np.max(results['mean_score'])\n",
"best_score_idx = np.argmax(results['mean_score'])\n",
"best_score_std = results['std_score'][int(best_score)]\n",
"best_parameters = results['parameters'][int(best_score)]\n",
" \n",
"from pprint import pprint\n",
"pprint(results)\n",
"print('Best score: {0:.2f} +/- {1:.2f}'.format(best_score, best_score_std))\n",
"print('Best parameter set: {0}'.format(best_parameters))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment