alexpearce/gridsearch.ipynb

## gridsearch.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import model_selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Every combination of the hyperparameters to scan\n",
    "parameter_grid = model_selection.ParameterGrid(dict(max_depth=[2, 3, 4], foo=['bar', 'baz']))\n",
    "# Unshuffled stratified folds are already deterministic, so no random_state is passed:\n",
    "# it has no effect without shuffle=True, and scikit-learn 0.24+ raises a ValueError\n",
    "# when random_state is set while shuffle is False.\n",
    "kfold = model_selection.StratifiedKFold(n_splits=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>label</th>\n",
       "      <th>w</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>13</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>14</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>15</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>6</td>\n",
       "      <td>16</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>7</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>8</td>\n",
       "      <td>18</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>9</td>\n",
       "      <td>19</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a   b  label  w\n",
       "0  0  10      0  1\n",
       "1  1  11      0  0\n",
       "2  2  12      0  1\n",
       "3  3  13      0  0\n",
       "4  4  14      0  0\n",
       "5  5  15      1  0\n",
       "6  6  16      1  0\n",
       "7  7  17      1  0\n",
       "8  8  18      1  0\n",
       "9  9  19      1  0"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Toy dataset: two integer features, a random 0/1 weight per row,\n",
    "# and five rows of each binary class label\n",
    "df = pd.DataFrame(dict(\n",
    "    a=range(10),\n",
    "    b=range(10, 20),\n",
    "    w=np.random.choice([0, 1], size=10),  # weights\n",
    "    label=np.repeat([0, 1], 5)  # binary classification labels\n",
    "))\n",
    "\n",
    "# Feature matrix, target labels and per-row weights\n",
    "X = df[['a', 'b']]\n",
    "y = df['label']\n",
    "W = df['w']\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'foo': 'bar', 'max_depth': 2}\n",
      "{'foo': 'bar', 'max_depth': 3}\n",
      "{'foo': 'bar', 'max_depth': 4}\n",
      "{'foo': 'baz', 'max_depth': 2}\n",
      "{'foo': 'baz', 'max_depth': 3}\n",
      "{'foo': 'baz', 'max_depth': 4}\n",
      "----------------------------------------\n",
      "(array([2, 3, 4, 5, 6, 7, 8, 9]), array([0, 1]))\n",
      "(array([0, 1, 4, 5, 6, 7, 8, 9]), array([2, 3]))\n",
      "(array([0, 1, 2, 3, 6, 7, 8, 9]), array([4, 5]))\n",
      "(array([0, 1, 2, 3, 4, 5, 8, 9]), array([6, 7]))\n",
      "(array([0, 1, 2, 3, 4, 5, 6, 7]), array([8, 9]))\n"
     ]
    }
   ],
   "source": [
    "# Show each hyperparameter combination in the grid...\n",
    "for combination in parameter_grid:\n",
    "    print(combination)\n",
    "\n",
    "print('-' * 40)\n",
    "\n",
    "# ...followed by the train/test row indices of every fold\n",
    "for train_rows, test_rows in kfold.split(X, y):\n",
    "    print(train_rows, test_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'mean_score': [0.53625350385827031,\n",
      "                0.38365189527912463,\n",
      "                0.55310431548829675,\n",
      "                0.44358784922652716,\n",
      "                0.54948020780469675,\n",
      "                0.68983372669340581],\n",
      " 'parameters': [{'foo': 'bar', 'max_depth': 2},\n",
      "                {'foo': 'bar', 'max_depth': 3},\n",
      "                {'foo': 'bar', 'max_depth': 4},\n",
      "                {'foo': 'baz', 'max_depth': 2},\n",
      "                {'foo': 'baz', 'max_depth': 3},\n",
      "                {'foo': 'baz', 'max_depth': 4}],\n",
      " 'score': [array([ 0.32875334,  0.84342939,  0.18596959,  0.76166583,  0.56144937]),\n",
      "           array([ 0.04631226,  0.8176691 ,  0.03392496,  0.82079794,  0.19955522]),\n",
      "           array([ 0.6222717 ,  0.63061044,  0.30095591,  0.31146542,  0.90021811]),\n",
      "           array([ 0.18428163,  0.23682483,  0.7379395 ,  0.33171623,  0.72717707]),\n",
      "           array([ 0.64512578,  0.485501  ,  0.08040944,  0.89287692,  0.6434879 ]),\n",
      "           array([ 0.72349646,  0.77137286,  0.74873705,  0.69683843,  0.50872385])],\n",
      " 'std_score': [0.24962280085738531,\n",
      "               0.36040703124664514,\n",
      "               0.22505443364865774,\n",
      "               0.24065396682144483,\n",
      "               0.26833842592133511,\n",
      "               0.093916202727686665]}\n",
      "Best score: 0.69 +/- 0.25\n",
      "Best parameter set: {'foo': 'bar', 'max_depth': 2}\n"
     ]
    }
   ],
   "source": [
    "from pprint import pprint\n",
    "\n",
    "# One entry per grid point, appended in grid order so the lists stay aligned\n",
    "results = {\n",
    "    'parameters': [],\n",
    "    'score': [],\n",
    "    'mean_score': [],\n",
    "    'std_score': []\n",
    "}\n",
    "\n",
    "# A poor-man's GridSearchCV.\n",
    "#   For every parameter combination:\n",
    "#     For every k-fold:\n",
    "#        1. Fit a classifier on the training data\n",
    "#        2. Test the fitted classifier on the testing data\n",
    "#        3. Record the metric using the classifier response on the test data\n",
    "#     Then compute the mean and standard deviation of the fold scores\n",
    "for parameters in parameter_grid:\n",
    "    results['parameters'].append(parameters)\n",
    "    scores = []\n",
    "\n",
    "    for train_index, test_index in kfold.split(X, y):\n",
    "        Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]\n",
    "        ytrain, ytest = y.iloc[train_index], y.iloc[test_index]\n",
    "        Wtrain = W.iloc[train_index]\n",
    "        Wtest = W.iloc[test_index]\n",
    "\n",
    "        # TODO: probably want to store the clf objects,\n",
    "        # so we can retrieve the best one later\n",
    "        # clf = ...\n",
    "        # clf.fit(...)\n",
    "        # score = compute_score(clf, Xtest, ytest)\n",
    "        # Placeholder metric until a real classifier is plugged in\n",
    "        score = np.random.uniform()\n",
    "        scores.append(score)\n",
    "\n",
    "    scores = np.array(scores)\n",
    "    mean = scores.mean()\n",
    "    std = scores.std()\n",
    "\n",
    "    results['score'].append(scores)\n",
    "    results['mean_score'].append(mean)\n",
    "    results['std_score'].append(std)\n",
    "\n",
    "# Look everything up through the position of the best mean score, so the\n",
    "# reported spread and parameter set belong to the same grid point.\n",
    "# (Indexing with int(best_score) truncated a score in [0, 1) to 0 and\n",
    "# therefore always reported the first grid point.)\n",
    "best_score_idx = int(np.argmax(results['mean_score']))\n",
    "best_score = results['mean_score'][best_score_idx]\n",
    "best_score_std = results['std_score'][best_score_idx]\n",
    "best_parameters = results['parameters'][best_score_idx]\n",
    "\n",
    "pprint(results)\n",
    "print('Best score: {0:.2f} +/- {1:.2f}'.format(best_score, best_score_std))\n",
    "print('Best parameter set: {0}'.format(best_parameters))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"from sklearn import model_selection"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Every combination of the hyperparameters to scan\n",
	"parameter_grid = model_selection.ParameterGrid(dict(max_depth=[2, 3, 4], foo=['bar', 'baz']))\n",
	"# Unshuffled stratified folds are already deterministic, so no random_state is passed:\n",
	"# it has no effect without shuffle=True, and scikit-learn 0.24+ raises a ValueError\n",
	"# when random_state is set while shuffle is False.\n",
	"kfold = model_selection.StratifiedKFold(n_splits=5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>a</th>\n",
	" <th>b</th>\n",
	" <th>label</th>\n",
	" <th>w</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0</td>\n",
	" <td>10</td>\n",
	" <td>0</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>11</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>2</td>\n",
	" <td>12</td>\n",
	" <td>0</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>3</td>\n",
	" <td>13</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>4</td>\n",
	" <td>14</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>5</td>\n",
	" <td>15</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>6</td>\n",
	" <td>16</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>7</th>\n",
	" <td>7</td>\n",
	" <td>17</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>8</th>\n",
	" <td>8</td>\n",
	" <td>18</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>9</th>\n",
	" <td>9</td>\n",
	" <td>19</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" a b label w\n",
	"0 0 10 0 1\n",
	"1 1 11 0 0\n",
	"2 2 12 0 1\n",
	"3 3 13 0 0\n",
	"4 4 14 0 0\n",
	"5 5 15 1 0\n",
	"6 6 16 1 0\n",
	"7 7 17 1 0\n",
	"8 8 18 1 0\n",
	"9 9 19 1 0"
	]
	},
	"execution_count": 37,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Toy dataset: two integer features, a random 0/1 weight per row,\n",
	"# and five rows of each binary class label\n",
	"df = pd.DataFrame(dict(a=range(10), b=range(10, 20),\n",
	"                       # Weights\n",
	"                       w=np.random.choice([0, 1], size=10),\n",
	"                       # Binary classification labels (the repetition operators\n",
	"                       # were missing here, which made the cell a syntax error)\n",
	"                       label=np.array([0]*5 + [1]*5)))\n",
	"# Feature matrix, target labels and per-row weights\n",
	"X = df[['a', 'b']]\n",
	"y = df.label\n",
	"W = df.w\n",
	"df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 44,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{'foo': 'bar', 'max_depth': 2}\n",
	"{'foo': 'bar', 'max_depth': 3}\n",
	"{'foo': 'bar', 'max_depth': 4}\n",
	"{'foo': 'baz', 'max_depth': 2}\n",
	"{'foo': 'baz', 'max_depth': 3}\n",
	"{'foo': 'baz', 'max_depth': 4}\n",
	"----------------------------------------\n",
	"(array([2, 3, 4, 5, 6, 7, 8, 9]), array([0, 1]))\n",
	"(array([0, 1, 4, 5, 6, 7, 8, 9]), array([2, 3]))\n",
	"(array([0, 1, 2, 3, 6, 7, 8, 9]), array([4, 5]))\n",
	"(array([0, 1, 2, 3, 4, 5, 8, 9]), array([6, 7]))\n",
	"(array([0, 1, 2, 3, 4, 5, 6, 7]), array([8, 9]))\n"
	]
	}
	],
	"source": [
	"# Show each hyperparameter combination in the grid...\n",
	"for combination in parameter_grid:\n",
	"    print(combination)\n",
	"\n",
	"print('-' * 40)\n",
	"\n",
	"# ...followed by the train/test row indices of every fold\n",
	"for train_rows, test_rows in kfold.split(X, y):\n",
	"    print(train_rows, test_rows)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 67,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{'mean_score': [0.53625350385827031,\n",
	" 0.38365189527912463,\n",
	" 0.55310431548829675,\n",
	" 0.44358784922652716,\n",
	" 0.54948020780469675,\n",
	" 0.68983372669340581],\n",
	" 'parameters': [{'foo': 'bar', 'max_depth': 2},\n",
	" {'foo': 'bar', 'max_depth': 3},\n",
	" {'foo': 'bar', 'max_depth': 4},\n",
	" {'foo': 'baz', 'max_depth': 2},\n",
	" {'foo': 'baz', 'max_depth': 3},\n",
	" {'foo': 'baz', 'max_depth': 4}],\n",
	" 'score': [array([ 0.32875334, 0.84342939, 0.18596959, 0.76166583, 0.56144937]),\n",
	" array([ 0.04631226, 0.8176691 , 0.03392496, 0.82079794, 0.19955522]),\n",
	" array([ 0.6222717 , 0.63061044, 0.30095591, 0.31146542, 0.90021811]),\n",
	" array([ 0.18428163, 0.23682483, 0.7379395 , 0.33171623, 0.72717707]),\n",
	" array([ 0.64512578, 0.485501 , 0.08040944, 0.89287692, 0.6434879 ]),\n",
	" array([ 0.72349646, 0.77137286, 0.74873705, 0.69683843, 0.50872385])],\n",
	" 'std_score': [0.24962280085738531,\n",
	" 0.36040703124664514,\n",
	" 0.22505443364865774,\n",
	" 0.24065396682144483,\n",
	" 0.26833842592133511,\n",
	" 0.093916202727686665]}\n",
	"Best score: 0.69 +/- 0.25\n",
	"Best parameter set: {'foo': 'bar', 'max_depth': 2}\n"
	]
	}
	],
	"source": [
	"from pprint import pprint\n",
	"\n",
	"# One entry per grid point, appended in grid order so the lists stay aligned\n",
	"results = {\n",
	"    'parameters': [],\n",
	"    'score': [],\n",
	"    'mean_score': [],\n",
	"    'std_score': []\n",
	"}\n",
	"\n",
	"# A poor-man's GridSearchCV.\n",
	"#   For every parameter combination:\n",
	"#     For every k-fold:\n",
	"#        1. Fit a classifier on the training data\n",
	"#        2. Test the fitted classifier on the testing data\n",
	"#        3. Record the metric using the classifier response on the test data\n",
	"#     Then compute the mean and standard deviation of the fold scores\n",
	"for parameters in parameter_grid:\n",
	"    results['parameters'].append(parameters)\n",
	"    scores = []\n",
	"\n",
	"    for train_index, test_index in kfold.split(X, y):\n",
	"        Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]\n",
	"        ytrain, ytest = y.iloc[train_index], y.iloc[test_index]\n",
	"        Wtrain = W.iloc[train_index]\n",
	"        Wtest = W.iloc[test_index]\n",
	"\n",
	"        # TODO: probably want to store the clf objects,\n",
	"        # so we can retrieve the best one later\n",
	"        # clf = ...\n",
	"        # clf.fit(...)\n",
	"        # score = compute_score(clf, Xtest, ytest)\n",
	"        # Placeholder metric until a real classifier is plugged in\n",
	"        score = np.random.uniform()\n",
	"        scores.append(score)\n",
	"\n",
	"    scores = np.array(scores)\n",
	"    mean = scores.mean()\n",
	"    std = scores.std()\n",
	"\n",
	"    results['score'].append(scores)\n",
	"    results['mean_score'].append(mean)\n",
	"    results['std_score'].append(std)\n",
	"\n",
	"# Look everything up through the position of the best mean score, so the\n",
	"# reported spread and parameter set belong to the same grid point.\n",
	"# (Indexing with int(best_score) truncated a score in [0, 1) to 0 and\n",
	"# therefore always reported the first grid point.)\n",
	"best_score_idx = int(np.argmax(results['mean_score']))\n",
	"best_score = results['mean_score'][best_score_idx]\n",
	"best_score_std = results['std_score'][best_score_idx]\n",
	"best_parameters = results['parameters'][best_score_idx]\n",
	"\n",
	"pprint(results)\n",
	"print('Best score: {0:.2f} +/- {1:.2f}'.format(best_score, best_score_std))\n",
	"print('Best parameter set: {0}'.format(best_parameters))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}