Created
November 2, 2016 21:01
-
-
Save kanungo/a7abd140f972951564024db4dd68d87e to your computer and use it in GitHub Desktop.
wk-10b-Fall 2016-random-forest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Reference: http://www.analyticbridge.com/profiles/blogs/random-forest-in-python" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>x1</th>\n", | |
" <th>x2</th>\n", | |
" <th>x3</th>\n", | |
" <th>x4</th>\n", | |
" <th>x5</th>\n", | |
" <th>x6</th>\n", | |
" <th>x7</th>\n", | |
" <th>Y</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>4.19</td>\n", | |
" <td>16.15</td>\n", | |
" <td>12.05</td>\n", | |
" <td>32.62</td>\n", | |
" <td>46.90</td>\n", | |
" <td>62.87</td>\n", | |
" <td>64.69</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>3.42</td>\n", | |
" <td>11.03</td>\n", | |
" <td>13.21</td>\n", | |
" <td>13.81</td>\n", | |
" <td>30.18</td>\n", | |
" <td>55.04</td>\n", | |
" <td>62.54</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>5.20</td>\n", | |
" <td>6.22</td>\n", | |
" <td>15.15</td>\n", | |
" <td>35.29</td>\n", | |
" <td>28.50</td>\n", | |
" <td>36.53</td>\n", | |
" <td>91.71</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4.31</td>\n", | |
" <td>8.82</td>\n", | |
" <td>16.89</td>\n", | |
" <td>27.40</td>\n", | |
" <td>43.41</td>\n", | |
" <td>65.96</td>\n", | |
" <td>78.77</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4.32</td>\n", | |
" <td>12.75</td>\n", | |
" <td>18.66</td>\n", | |
" <td>34.15</td>\n", | |
" <td>13.97</td>\n", | |
" <td>51.44</td>\n", | |
" <td>50.80</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" x1 x2 x3 x4 x5 x6 x7 Y\n", | |
"0 4.19 16.15 12.05 32.62 46.90 62.87 64.69 1\n", | |
"1 3.42 11.03 13.21 13.81 30.18 55.04 62.54 1\n", | |
"2 5.20 6.22 15.15 35.29 28.50 36.53 91.71 1\n", | |
"3 4.31 8.82 16.89 27.40 43.41 65.96 78.77 1\n", | |
"4 4.32 12.75 18.66 34.15 13.97 51.44 50.80 1" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Import data\n", | |
"import pandas as pd\n", | |
"\n", | |
"# Path to the input CSV, kept in one named constant so the notebook can be\n", | |
"# pointed at another copy of the file without touching the loading code.\n", | |
"DATA_PATH = \"/home/drk/kanungo/DNSC6211/w10/wk10b-datav2.csv\"\n", | |
"\n", | |
"df = pd.read_csv(DATA_PATH)\n", | |
"# Show the first rows as a quick check of what was loaded.\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Create a training and test set\n", | |
"# train_test_split lives in sklearn.model_selection from scikit-learn 0.18 on;\n", | |
"# sklearn.cross_validation was removed in 0.20, so it is only a fallback here.\n", | |
"try:\n", | |
"    from sklearn.model_selection import train_test_split\n", | |
"except ImportError:\n", | |
"    from sklearn.cross_validation import train_test_split\n", | |
"\n", | |
"# 75% of the rows go to training; random_state fixes the shuffle so the split\n", | |
"# is the same on every run.\n", | |
"train, test = train_test_split(df, train_size=0.75, random_state=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Rebuild each split as a DataFrame labelled with the columns of df.\n", | |
"dfTrain, dfTest = (\n", | |
"    pd.DataFrame(part, columns=df.columns) for part in (train, test)\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.ensemble import RandomForestClassifier" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Predictor column names (x1 .. x7) and the response column name, held in\n", | |
"# separate lists so that inputs and output can be pulled out independently.\n", | |
"cols = ['x{}'.format(i) for i in range(1, 8)]\n", | |
"colsRes = ['Y']\n", | |
"cols" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 3.77, 6.84, 21.83, ..., 59.28, 68.43, 82.4 ],\n", | |
" [ 34.66, 29.6 , 52.32, ..., 95.03, 33.69, 93.63],\n", | |
" [ 57.75, 16.83, 48.97, ..., 104.63, 36.01, 132.76],\n", | |
" ..., \n", | |
" [ 6.91, 12.24, 16.98, ..., 40.74, 49.66, 48.8 ],\n", | |
" [ 25.8 , 19.43, 69.56, ..., 68.83, 86.62, 43.73],\n", | |
" [ 4.88, 15.59, 16.9 , ..., 27.09, 99.51, 75.68]])" | |
] | |
}, | |
"execution_count": 25, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;\n", | |
"# selecting the columns and reading .values yields the same NumPy arrays.\n", | |
"trainArr = dfTrain[cols].values  # training array: one column per name in cols\n", | |
"trainRes = dfTrain[colsRes].values  # training results: single column, shape (n, 1)\n", | |
"trainArr" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/drk/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", | |
" app.launch_new_instance()\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", | |
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n", | |
" min_samples_leaf=1, min_samples_split=2,\n", | |
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", | |
" oob_score=False, random_state=None, verbose=0,\n", | |
" warm_start=False)" | |
] | |
}, | |
"execution_count": 34, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"## Training!\n", | |
"# A fixed random_state makes the fitted forest reproducible from run to run.\n", | |
"rf = RandomForestClassifier(random_state=1)  # initialize\n", | |
"# trainRes is a column vector; ravel() flattens it to the 1-d shape that fit()\n", | |
"# expects, which avoids scikit-learn's DataConversionWarning.\n", | |
"rf.fit(trainArr, trainRes.ravel())  # fit the data to the algorithm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"## Testing!\n", | |
"# Put the test data in the same format as the training data. .values replaces\n", | |
"# DataFrame.as_matrix(), which was removed in pandas 1.0.\n", | |
"testArr = dfTest[cols].values\n", | |
"results = rf.predict(testArr)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Add the predictions to the test data frame as a new column so they can be\n", | |
"# compared side-by-side with the true labels in Y.\n", | |
"dfTest['predictions'] = results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>x1</th>\n", | |
" <th>x2</th>\n", | |
" <th>x3</th>\n", | |
" <th>x4</th>\n", | |
" <th>x5</th>\n", | |
" <th>x6</th>\n", | |
" <th>x7</th>\n", | |
" <th>Y</th>\n", | |
" <th>predictions</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>529</th>\n", | |
" <td>39.29</td>\n", | |
" <td>17.12</td>\n", | |
" <td>31.87</td>\n", | |
" <td>23.34</td>\n", | |
" <td>120.24</td>\n", | |
" <td>18.47</td>\n", | |
" <td>188.23</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>403</th>\n", | |
" <td>33.73</td>\n", | |
" <td>18.52</td>\n", | |
" <td>53.75</td>\n", | |
" <td>24.39</td>\n", | |
" <td>103.64</td>\n", | |
" <td>34.97</td>\n", | |
" <td>122.41</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>464</th>\n", | |
" <td>17.83</td>\n", | |
" <td>24.59</td>\n", | |
" <td>80.69</td>\n", | |
" <td>15.04</td>\n", | |
" <td>90.65</td>\n", | |
" <td>21.46</td>\n", | |
" <td>127.57</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>142</th>\n", | |
" <td>16.19</td>\n", | |
" <td>38.34</td>\n", | |
" <td>39.56</td>\n", | |
" <td>44.62</td>\n", | |
" <td>64.12</td>\n", | |
" <td>55.20</td>\n", | |
" <td>52.93</td>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>521</th>\n", | |
" <td>40.34</td>\n", | |
" <td>18.47</td>\n", | |
" <td>52.77</td>\n", | |
" <td>7.86</td>\n", | |
" <td>116.33</td>\n", | |
" <td>49.84</td>\n", | |
" <td>125.93</td>\n", | |
" <td>3</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" x1 x2 x3 x4 x5 x6 x7 Y predictions\n", | |
"529 39.29 17.12 31.87 23.34 120.24 18.47 188.23 3 3\n", | |
"403 33.73 18.52 53.75 24.39 103.64 34.97 122.41 3 3\n", | |
"464 17.83 24.59 80.69 15.04 90.65 21.46 127.57 3 3\n", | |
"142 16.19 38.34 39.56 44.62 64.12 55.20 52.93 2 2\n", | |
"521 40.34 18.47 52.77 7.86 116.33 49.84 125.93 3 3" | |
] | |
}, | |
"execution_count": 37, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Preview the first five test rows, with Y and predictions side by side.\n", | |
"dfTest.head(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" precision recall f1-score support\n", | |
"\n", | |
" 1 0.97 1.00 0.98 28\n", | |
" 2 0.94 0.94 0.94 34\n", | |
" 3 0.99 0.97 0.98 72\n", | |
"\n", | |
"avg / total 0.97 0.97 0.97 134\n", | |
"\n", | |
"[[28 0 0]\n", | |
" [ 1 32 1]\n", | |
" [ 0 2 70]]\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn import metrics\n", | |
"\n", | |
"# Summarize how well the model did on the test set: first the per-class\n", | |
"# precision/recall/F1 report, then the confusion matrix.\n", | |
"print(metrics.classification_report(dfTest['Y'], dfTest['predictions']))\n", | |
"print(metrics.confusion_matrix(dfTest['Y'], dfTest['predictions']))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# http://stackoverflow.com/questions/29221849/splitting-data-into-test-and-train-making-a-logistic-regression-model-in-pandas\n", | |
"import pandas as pd\n", | |
"# sklearn.cross_validation was removed in scikit-learn 0.20; prefer\n", | |
"# sklearn.model_selection and fall back only on older installations.\n", | |
"try:\n", | |
"    from sklearn.model_selection import train_test_split\n", | |
"except ImportError:\n", | |
"    from sklearn.cross_validation import train_test_split\n", | |
"import statsmodels.api as sm\n", | |
"\n", | |
"quality = pd.read_csv(\"https://courses.edx.org/c4x/MITx/15.071x/asset/quality.csv\")\n", | |
"# NOTE: train and test are rebound here, replacing the splits of df made earlier.\n", | |
"train, test = train_test_split(quality, train_size=0.75, random_state=1)\n", | |
"\n", | |
"# Rebuild each split as a DataFrame labelled with the columns of quality.\n", | |
"qualityTrain = pd.DataFrame(train, columns=quality.columns)\n", | |
"qualityTest = pd.DataFrame(test, columns=quality.columns)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [conda root]", | |
"language": "python", | |
"name": "conda-root-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment