Skip to content

Instantly share code, notes, and snippets.

@kanungo
Created November 2, 2016 21:01
Show Gist options
  • Save kanungo/a7abd140f972951564024db4dd68d87e to your computer and use it in GitHub Desktop.
Save kanungo/a7abd140f972951564024db4dd68d87e to your computer and use it in GitHub Desktop.
wk-10b-Fall 2016-random-forest
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Reference: http://www.analyticbridge.com/profiles/blogs/random-forest-in-python"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>x3</th>\n",
" <th>x4</th>\n",
" <th>x5</th>\n",
" <th>x6</th>\n",
" <th>x7</th>\n",
" <th>Y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.19</td>\n",
" <td>16.15</td>\n",
" <td>12.05</td>\n",
" <td>32.62</td>\n",
" <td>46.90</td>\n",
" <td>62.87</td>\n",
" <td>64.69</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.42</td>\n",
" <td>11.03</td>\n",
" <td>13.21</td>\n",
" <td>13.81</td>\n",
" <td>30.18</td>\n",
" <td>55.04</td>\n",
" <td>62.54</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5.20</td>\n",
" <td>6.22</td>\n",
" <td>15.15</td>\n",
" <td>35.29</td>\n",
" <td>28.50</td>\n",
" <td>36.53</td>\n",
" <td>91.71</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.31</td>\n",
" <td>8.82</td>\n",
" <td>16.89</td>\n",
" <td>27.40</td>\n",
" <td>43.41</td>\n",
" <td>65.96</td>\n",
" <td>78.77</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4.32</td>\n",
" <td>12.75</td>\n",
" <td>18.66</td>\n",
" <td>34.15</td>\n",
" <td>13.97</td>\n",
" <td>51.44</td>\n",
" <td>50.80</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" x1 x2 x3 x4 x5 x6 x7 Y\n",
"0 4.19 16.15 12.05 32.62 46.90 62.87 64.69 1\n",
"1 3.42 11.03 13.21 13.81 30.18 55.04 62.54 1\n",
"2 5.20 6.22 15.15 35.29 28.50 36.53 91.71 1\n",
"3 4.31 8.82 16.89 27.40 43.41 65.96 78.77 1\n",
"4 4.32 12.75 18.66 34.15 13.97 51.44 50.80 1"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the course data set into a DataFrame.\n",
"# The location can be overridden with the WK10B_DATA_PATH environment variable;\n",
"# it falls back to the original author's local path so existing setups keep working.\n",
"import os\n",
"\n",
"import pandas as pd\n",
"\n",
"DATA_PATH = os.environ.get(\n",
"    \"WK10B_DATA_PATH\",\n",
"    \"/home/drk/kanungo/DNSC6211/w10/wk10b-datav2.csv\",\n",
")\n",
"df = pd.read_csv(DATA_PATH)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Create a training and test set (75% / 25%), seeded for reproducibility.\n",
"# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;\n",
"# prefer `sklearn.model_selection` and fall back only on older installations.\n",
"try:\n",
"    from sklearn.model_selection import train_test_split\n",
"except ImportError:\n",
"    from sklearn.cross_validation import train_test_split\n",
"\n",
"train, test = train_test_split(df, train_size=0.75, random_state=1)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Wrap both splits in DataFrames that carry the original column labels.\n",
"dfTrain, dfTest = (\n",
"    pd.DataFrame(split, columns=df.columns) for split in (train, test)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7']"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Feature columns (x1..x7) and the response column, kept as separate lists\n",
"# so predictors and target can be pulled out of the frame independently.\n",
"cols = ['x{}'.format(i) for i in range(1, 8)]\n",
"colsRes = ['Y']\n",
"cols"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 3.77, 6.84, 21.83, ..., 59.28, 68.43, 82.4 ],\n",
" [ 34.66, 29.6 , 52.32, ..., 95.03, 33.69, 93.63],\n",
" [ 57.75, 16.83, 48.97, ..., 104.63, 36.01, 132.76],\n",
" ..., \n",
" [ 6.91, 12.24, 16.98, ..., 40.74, 49.66, 48.8 ],\n",
" [ 25.8 , 19.43, 69.56, ..., 68.83, 86.62, 43.73],\n",
" [ 4.88, 15.59, 16.9 , ..., 27.09, 99.51, 75.68]])"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Extract the predictors and the response as NumPy arrays.\n",
"# `DataFrame.as_matrix` was deprecated in pandas 0.23 and removed in 1.0;\n",
"# column selection followed by `.values` is equivalent and works on every version.\n",
"trainArr = dfTrain[cols].values  # training features, shape (n_samples, n_features)\n",
"trainRes = dfTrain[colsRes].values  # training labels as a column, shape (n_samples, 1)\n",
"trainArr"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/drk/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" app.launch_new_instance()\n"
]
},
{
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False)"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## Training!\n",
"# Seed the forest so that repeated runs grow the same trees.\n",
"rf = RandomForestClassifier(random_state=1)\n",
"# scikit-learn expects y with shape (n_samples,); flatten the column vector\n",
"# to avoid a DataConversionWarning.\n",
"rf.fit(trainArr, trainRes.ravel())"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Testing!\n",
"# Put the test data in the same format as the training features.\n",
"# `DataFrame.as_matrix` was removed in pandas 1.0; `.values` is the portable equivalent.\n",
"testArr = dfTest[cols].values\n",
"results = rf.predict(testArr)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Add the predictions next to the true labels for a side-by-side comparison.\n",
"# `assign` builds a new frame rather than writing into `dfTest` in place, which\n",
"# sidesteps pandas' SettingWithCopyWarning if `dfTest` is a view of `df`.\n",
"dfTest = dfTest.assign(predictions=results)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>x3</th>\n",
" <th>x4</th>\n",
" <th>x5</th>\n",
" <th>x6</th>\n",
" <th>x7</th>\n",
" <th>Y</th>\n",
" <th>predictions</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>529</th>\n",
" <td>39.29</td>\n",
" <td>17.12</td>\n",
" <td>31.87</td>\n",
" <td>23.34</td>\n",
" <td>120.24</td>\n",
" <td>18.47</td>\n",
" <td>188.23</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>403</th>\n",
" <td>33.73</td>\n",
" <td>18.52</td>\n",
" <td>53.75</td>\n",
" <td>24.39</td>\n",
" <td>103.64</td>\n",
" <td>34.97</td>\n",
" <td>122.41</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>464</th>\n",
" <td>17.83</td>\n",
" <td>24.59</td>\n",
" <td>80.69</td>\n",
" <td>15.04</td>\n",
" <td>90.65</td>\n",
" <td>21.46</td>\n",
" <td>127.57</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>16.19</td>\n",
" <td>38.34</td>\n",
" <td>39.56</td>\n",
" <td>44.62</td>\n",
" <td>64.12</td>\n",
" <td>55.20</td>\n",
" <td>52.93</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>521</th>\n",
" <td>40.34</td>\n",
" <td>18.47</td>\n",
" <td>52.77</td>\n",
" <td>7.86</td>\n",
" <td>116.33</td>\n",
" <td>49.84</td>\n",
" <td>125.93</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" x1 x2 x3 x4 x5 x6 x7 Y predictions\n",
"529 39.29 17.12 31.87 23.34 120.24 18.47 188.23 3 3\n",
"403 33.73 18.52 53.75 24.39 103.64 34.97 122.41 3 3\n",
"464 17.83 24.59 80.69 15.04 90.65 21.46 127.57 3 3\n",
"142 16.19 38.34 39.56 44.62 64.12 55.20 52.93 2 2\n",
"521 40.34 18.47 52.77 7.86 116.33 49.84 125.93 3 3"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five test rows together with their predicted class.\n",
"dfTest.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 1 0.97 1.00 0.98 28\n",
" 2 0.94 0.94 0.94 34\n",
" 3 0.99 0.97 0.98 72\n",
"\n",
"avg / total 0.97 0.97 0.97 134\n",
"\n",
"[[28 0 0]\n",
" [ 1 32 1]\n",
" [ 0 2 70]]\n"
]
}
],
"source": [
"from sklearn import metrics\n",
"\n",
"# Summarize the fit of the model on the held-out rows: per-class\n",
"# precision/recall/F1 first, then the confusion matrix on the next line.\n",
"print(\n",
"    metrics.classification_report(dfTest['Y'], dfTest['predictions']),\n",
"    metrics.confusion_matrix(dfTest['Y'], dfTest['predictions']),\n",
"    sep='\\n',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# http://stackoverflow.com/questions/29221849/splitting-data-into-test-and-train-making-a-logistic-regression-model-in-pandas\n",
"import pandas as pd\n",
"import statsmodels.api as sm\n",
"\n",
"# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer\n",
"# `sklearn.model_selection` and fall back only on older installations.\n",
"try:\n",
"    from sklearn.model_selection import train_test_split\n",
"except ImportError:\n",
"    from sklearn.cross_validation import train_test_split\n",
"\n",
"# Download the quality data set and split it 75% / 25% with a fixed seed.\n",
"quality = pd.read_csv(\"https://courses.edx.org/c4x/MITx/15.071x/asset/quality.csv\")\n",
"# NOTE(review): this rebinds `train`/`test` from the random-forest section above;\n",
"# re-running earlier cells afterwards would pick up the quality split instead.\n",
"train, test = train_test_split(quality, train_size=0.75, random_state=1)\n",
"\n",
"qualityTrain = pd.DataFrame(train, columns=quality.columns)\n",
"qualityTest = pd.DataFrame(test, columns=quality.columns)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment