kanungo/wk-10b-Fall 2016-random-forest.ipynb

## wk-10b-Fall 2016-random-forest.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# http://www.analyticbridge.com/profiles/blogs/random-forest-in-python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>x3</th>\n",
       "      <th>x4</th>\n",
       "      <th>x5</th>\n",
       "      <th>x6</th>\n",
       "      <th>x7</th>\n",
       "      <th>Y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4.19</td>\n",
       "      <td>16.15</td>\n",
       "      <td>12.05</td>\n",
       "      <td>32.62</td>\n",
       "      <td>46.90</td>\n",
       "      <td>62.87</td>\n",
       "      <td>64.69</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.42</td>\n",
       "      <td>11.03</td>\n",
       "      <td>13.21</td>\n",
       "      <td>13.81</td>\n",
       "      <td>30.18</td>\n",
       "      <td>55.04</td>\n",
       "      <td>62.54</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5.20</td>\n",
       "      <td>6.22</td>\n",
       "      <td>15.15</td>\n",
       "      <td>35.29</td>\n",
       "      <td>28.50</td>\n",
       "      <td>36.53</td>\n",
       "      <td>91.71</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.31</td>\n",
       "      <td>8.82</td>\n",
       "      <td>16.89</td>\n",
       "      <td>27.40</td>\n",
       "      <td>43.41</td>\n",
       "      <td>65.96</td>\n",
       "      <td>78.77</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4.32</td>\n",
       "      <td>12.75</td>\n",
       "      <td>18.66</td>\n",
       "      <td>34.15</td>\n",
       "      <td>13.97</td>\n",
       "      <td>51.44</td>\n",
       "      <td>50.80</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     x1     x2     x3     x4     x5     x6     x7  Y\n",
       "0  4.19  16.15  12.05  32.62  46.90  62.87  64.69  1\n",
       "1  3.42  11.03  13.21  13.81  30.18  55.04  62.54  1\n",
       "2  5.20   6.22  15.15  35.29  28.50  36.53  91.71  1\n",
       "3  4.31   8.82  16.89  27.40  43.41  65.96  78.77  1\n",
       "4  4.32  12.75  18.66  34.15  13.97  51.44  50.80  1"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Import data\n",
    "import pandas as pd\n",
    "df=pd.read_csv(\"/home/drk/kanungo/DNSC6211/w10/wk10b-datav2.csv\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Create a training and test set\n",
    "from sklearn.cross_validation import train_test_split\n",
    "train, test = train_test_split(df, train_size=0.75, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dfTrain = pd.DataFrame(train, columns=df.columns)\n",
    "dfTest = pd.DataFrame(test, columns=df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the data have to be in a numpy array in order for\n",
    "# the random forest algorithm to accept it!\n",
    "# Also, output must be separated.\n",
    "cols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'] \n",
    "colsRes = ['Y']\n",
    "cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[   3.77,    6.84,   21.83, ...,   59.28,   68.43,   82.4 ],\n",
       "       [  34.66,   29.6 ,   52.32, ...,   95.03,   33.69,   93.63],\n",
       "       [  57.75,   16.83,   48.97, ...,  104.63,   36.01,  132.76],\n",
       "       ..., \n",
       "       [   6.91,   12.24,   16.98, ...,   40.74,   49.66,   48.8 ],\n",
       "       [  25.8 ,   19.43,   69.56, ...,   68.83,   86.62,   43.73],\n",
       "       [   4.88,   15.59,   16.9 , ...,   27.09,   99.51,   75.68]])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainArr = dfTrain.as_matrix(cols) #training array\n",
    "trainRes = dfTrain.as_matrix(colsRes) # training results\n",
    "trainArr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/drk/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
      "  app.launch_new_instance()\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_samples_leaf=1, min_samples_split=2,\n",
       "            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
       "            oob_score=False, random_state=None, verbose=0,\n",
       "            warm_start=False)"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Training!\n",
    "rf = RandomForestClassifier() # initialize\n",
    "rf.fit(trainArr, trainRes) # fit the data to the algorithm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Testing!\n",
    "# put the test data in the same format!\n",
    "testArr = dfTest.as_matrix(cols)\n",
    "results = rf.predict(testArr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Add result back to the data frame, so I can compare side-by-side\n",
    "dfTest['predictions'] = results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>x3</th>\n",
       "      <th>x4</th>\n",
       "      <th>x5</th>\n",
       "      <th>x6</th>\n",
       "      <th>x7</th>\n",
       "      <th>Y</th>\n",
       "      <th>predictions</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>529</th>\n",
       "      <td>39.29</td>\n",
       "      <td>17.12</td>\n",
       "      <td>31.87</td>\n",
       "      <td>23.34</td>\n",
       "      <td>120.24</td>\n",
       "      <td>18.47</td>\n",
       "      <td>188.23</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>403</th>\n",
       "      <td>33.73</td>\n",
       "      <td>18.52</td>\n",
       "      <td>53.75</td>\n",
       "      <td>24.39</td>\n",
       "      <td>103.64</td>\n",
       "      <td>34.97</td>\n",
       "      <td>122.41</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>464</th>\n",
       "      <td>17.83</td>\n",
       "      <td>24.59</td>\n",
       "      <td>80.69</td>\n",
       "      <td>15.04</td>\n",
       "      <td>90.65</td>\n",
       "      <td>21.46</td>\n",
       "      <td>127.57</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>16.19</td>\n",
       "      <td>38.34</td>\n",
       "      <td>39.56</td>\n",
       "      <td>44.62</td>\n",
       "      <td>64.12</td>\n",
       "      <td>55.20</td>\n",
       "      <td>52.93</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>521</th>\n",
       "      <td>40.34</td>\n",
       "      <td>18.47</td>\n",
       "      <td>52.77</td>\n",
       "      <td>7.86</td>\n",
       "      <td>116.33</td>\n",
       "      <td>49.84</td>\n",
       "      <td>125.93</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        x1     x2     x3     x4      x5     x6      x7  Y  predictions\n",
       "529  39.29  17.12  31.87  23.34  120.24  18.47  188.23  3            3\n",
       "403  33.73  18.52  53.75  24.39  103.64  34.97  122.41  3            3\n",
       "464  17.83  24.59  80.69  15.04   90.65  21.46  127.57  3            3\n",
       "142  16.19  38.34  39.56  44.62   64.12  55.20   52.93  2            2\n",
       "521  40.34  18.47  52.77   7.86  116.33  49.84  125.93  3            3"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfTest.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          1       0.97      1.00      0.98        28\n",
      "          2       0.94      0.94      0.94        34\n",
      "          3       0.99      0.97      0.98        72\n",
      "\n",
      "avg / total       0.97      0.97      0.97       134\n",
      "\n",
      "[[28  0  0]\n",
      " [ 1 32  1]\n",
      " [ 0  2 70]]\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "# summarize the fit of the model\n",
    "print(metrics.classification_report(dfTest.Y, dfTest.predictions))\n",
    "print(metrics.confusion_matrix(dfTest.Y, dfTest.predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# http://stackoverflow.com/questions/29221849/splitting-data-into-test-and-train-making-a-logistic-regression-model-in-pandas\n",
    "import pandas as pd\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import statsmodels.api as sm\n",
    "\n",
    "quality = pd.read_csv(\"https://courses.edx.org/c4x/MITx/15.071x/asset/quality.csv\")\n",
    "train, test = train_test_split(quality, train_size=0.75, random_state=1)\n",
    "\n",
    "qualityTrain = pd.DataFrame(train, columns=quality.columns)\n",
    "qualityTest = pd.DataFrame(test, columns=quality.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# http://www.analyticbridge.com/profiles/blogs/random-forest-in-python"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>x1</th>\n",
	" <th>x2</th>\n",
	" <th>x3</th>\n",
	" <th>x4</th>\n",
	" <th>x5</th>\n",
	" <th>x6</th>\n",
	" <th>x7</th>\n",
	" <th>Y</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>4.19</td>\n",
	" <td>16.15</td>\n",
	" <td>12.05</td>\n",
	" <td>32.62</td>\n",
	" <td>46.90</td>\n",
	" <td>62.87</td>\n",
	" <td>64.69</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>3.42</td>\n",
	" <td>11.03</td>\n",
	" <td>13.21</td>\n",
	" <td>13.81</td>\n",
	" <td>30.18</td>\n",
	" <td>55.04</td>\n",
	" <td>62.54</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>5.20</td>\n",
	" <td>6.22</td>\n",
	" <td>15.15</td>\n",
	" <td>35.29</td>\n",
	" <td>28.50</td>\n",
	" <td>36.53</td>\n",
	" <td>91.71</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>4.31</td>\n",
	" <td>8.82</td>\n",
	" <td>16.89</td>\n",
	" <td>27.40</td>\n",
	" <td>43.41</td>\n",
	" <td>65.96</td>\n",
	" <td>78.77</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>4.32</td>\n",
	" <td>12.75</td>\n",
	" <td>18.66</td>\n",
	" <td>34.15</td>\n",
	" <td>13.97</td>\n",
	" <td>51.44</td>\n",
	" <td>50.80</td>\n",
	" <td>1</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" x1 x2 x3 x4 x5 x6 x7 Y\n",
	"0 4.19 16.15 12.05 32.62 46.90 62.87 64.69 1\n",
	"1 3.42 11.03 13.21 13.81 30.18 55.04 62.54 1\n",
	"2 5.20 6.22 15.15 35.29 28.50 36.53 91.71 1\n",
	"3 4.31 8.82 16.89 27.40 43.41 65.96 78.77 1\n",
	"4 4.32 12.75 18.66 34.15 13.97 51.44 50.80 1"
	]
	},
	"execution_count": 19,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Import data\n",
	"import pandas as pd\n",
	"df=pd.read_csv(\"/home/drk/kanungo/DNSC6211/w10/wk10b-datav2.csv\")\n",
	"df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Create a training and test set\n",
	"from sklearn.cross_validation import train_test_split\n",
	"train, test = train_test_split(df, train_size=0.75, random_state=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"dfTrain = pd.DataFrame(train, columns=df.columns)\n",
	"dfTest = pd.DataFrame(test, columns=df.columns)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from sklearn.ensemble import RandomForestClassifier"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']"
	]
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# the data have to be in a numpy array in order for\n",
	"# the random forest algorithm to accept it!\n",
	"# Also, output must be separated.\n",
	"cols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'] \n",
	"colsRes = ['Y']\n",
	"cols"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([[ 3.77, 6.84, 21.83, ..., 59.28, 68.43, 82.4 ],\n",
	" [ 34.66, 29.6 , 52.32, ..., 95.03, 33.69, 93.63],\n",
	" [ 57.75, 16.83, 48.97, ..., 104.63, 36.01, 132.76],\n",
	" ..., \n",
	" [ 6.91, 12.24, 16.98, ..., 40.74, 49.66, 48.8 ],\n",
	" [ 25.8 , 19.43, 69.56, ..., 68.83, 86.62, 43.73],\n",
	" [ 4.88, 15.59, 16.9 , ..., 27.09, 99.51, 75.68]])"
	]
	},
	"execution_count": 25,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"trainArr = dfTrain.as_matrix(cols) #training array\n",
	"trainRes = dfTrain.as_matrix(colsRes) # training results\n",
	"trainArr"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 34,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/home/drk/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
	" app.launch_new_instance()\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
	" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
	" min_samples_leaf=1, min_samples_split=2,\n",
	" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
	" oob_score=False, random_state=None, verbose=0,\n",
	" warm_start=False)"
	]
	},
	"execution_count": 34,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"## Training!\n",
	"rf = RandomForestClassifier() # initialize\n",
	"rf.fit(trainArr, trainRes) # fit the data to the algorithm"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 35,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"## Testing!\n",
	"# put the test data in the same format!\n",
	"testArr = dfTest.as_matrix(cols)\n",
	"results = rf.predict(testArr)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Add result back to the data frame, so I can compare side-by-side\n",
	"dfTest['predictions'] = results"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>x1</th>\n",
	" <th>x2</th>\n",
	" <th>x3</th>\n",
	" <th>x4</th>\n",
	" <th>x5</th>\n",
	" <th>x6</th>\n",
	" <th>x7</th>\n",
	" <th>Y</th>\n",
	" <th>predictions</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>529</th>\n",
	" <td>39.29</td>\n",
	" <td>17.12</td>\n",
	" <td>31.87</td>\n",
	" <td>23.34</td>\n",
	" <td>120.24</td>\n",
	" <td>18.47</td>\n",
	" <td>188.23</td>\n",
	" <td>3</td>\n",
	" <td>3</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>403</th>\n",
	" <td>33.73</td>\n",
	" <td>18.52</td>\n",
	" <td>53.75</td>\n",
	" <td>24.39</td>\n",
	" <td>103.64</td>\n",
	" <td>34.97</td>\n",
	" <td>122.41</td>\n",
	" <td>3</td>\n",
	" <td>3</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>464</th>\n",
	" <td>17.83</td>\n",
	" <td>24.59</td>\n",
	" <td>80.69</td>\n",
	" <td>15.04</td>\n",
	" <td>90.65</td>\n",
	" <td>21.46</td>\n",
	" <td>127.57</td>\n",
	" <td>3</td>\n",
	" <td>3</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>142</th>\n",
	" <td>16.19</td>\n",
	" <td>38.34</td>\n",
	" <td>39.56</td>\n",
	" <td>44.62</td>\n",
	" <td>64.12</td>\n",
	" <td>55.20</td>\n",
	" <td>52.93</td>\n",
	" <td>2</td>\n",
	" <td>2</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>521</th>\n",
	" <td>40.34</td>\n",
	" <td>18.47</td>\n",
	" <td>52.77</td>\n",
	" <td>7.86</td>\n",
	" <td>116.33</td>\n",
	" <td>49.84</td>\n",
	" <td>125.93</td>\n",
	" <td>3</td>\n",
	" <td>3</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" x1 x2 x3 x4 x5 x6 x7 Y predictions\n",
	"529 39.29 17.12 31.87 23.34 120.24 18.47 188.23 3 3\n",
	"403 33.73 18.52 53.75 24.39 103.64 34.97 122.41 3 3\n",
	"464 17.83 24.59 80.69 15.04 90.65 21.46 127.57 3 3\n",
	"142 16.19 38.34 39.56 44.62 64.12 55.20 52.93 2 2\n",
	"521 40.34 18.47 52.77 7.86 116.33 49.84 125.93 3 3"
	]
	},
	"execution_count": 37,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"dfTest.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 38,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" precision recall f1-score support\n",
	"\n",
	" 1 0.97 1.00 0.98 28\n",
	" 2 0.94 0.94 0.94 34\n",
	" 3 0.99 0.97 0.98 72\n",
	"\n",
	"avg / total 0.97 0.97 0.97 134\n",
	"\n",
	"[[28 0 0]\n",
	" [ 1 32 1]\n",
	" [ 0 2 70]]\n"
	]
	}
	],
	"source": [
	"from sklearn import metrics\n",
	"\n",
	"# summarize the fit of the model\n",
	"print(metrics.classification_report(dfTest.Y, dfTest.predictions))\n",
	"print(metrics.confusion_matrix(dfTest.Y, dfTest.predictions))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# http://stackoverflow.com/questions/29221849/splitting-data-into-test-and-train-making-a-logistic-regression-model-in-pandas\n",
	"import pandas as pd\n",
	"from sklearn.cross_validation import train_test_split\n",
	"import statsmodels.api as sm\n",
	"\n",
	"quality = pd.read_csv(\"https://courses.edx.org/c4x/MITx/15.071x/asset/quality.csv\")\n",
	"train, test = train_test_split(quality, train_size=0.75, random_state=1)\n",
	"\n",
	"qualityTrain = pd.DataFrame(train, columns=quality.columns)\n",
	"qualityTest = pd.DataFrame(test, columns=quality.columns)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python [conda root]",
	"language": "python",
	"name": "conda-root-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}