Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Vikrant79/d14da7541a364e6232b7 to your computer and use it in GitHub Desktop.
Save Vikrant79/d14da7541a364e6232b7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Applications/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (8,9,10,11,12,43,157,167,177,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
}
],
"source": [
"# load relevant libraries\n",
"\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import seaborn as sns\n",
"\n",
"%matplotlib inline\n",
"\n",
"# Read in Greenleaf Train.csv train file\n",
"url = \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/SpringLeaf/train.csv\"\n",
"train = pd.read_csv(url).set_index(\"ID\")\n",
"\n",
"# Read in Greenleaf Train.csv teat file\n",
"url = \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/SpringLeaf/test.csv\"\n",
"test = pd.read_csv(url).set_index(\"ID\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>VAR_0002</th>\n",
" <th>VAR_0003</th>\n",
" <th>VAR_0004</th>\n",
" <th>VAR_0006</th>\n",
" <th>VAR_0007</th>\n",
" <th>VAR_0013</th>\n",
" <th>VAR_0014</th>\n",
" <th>VAR_0015</th>\n",
" <th>VAR_0016</th>\n",
" <th>...</th>\n",
" <th>VAR_1925</th>\n",
" <th>VAR_1926</th>\n",
" <th>VAR_1927</th>\n",
" <th>VAR_1928</th>\n",
" <th>VAR_1929</th>\n",
" <th>VAR_1930</th>\n",
" <th>VAR_1931</th>\n",
" <th>VAR_1932</th>\n",
" <th>VAR_1933</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145175.000000</td>\n",
" <td> 145175.00000</td>\n",
" <td> 145175.000000</td>\n",
" <td> 145175.000000</td>\n",
" <td> 145175.000000</td>\n",
" <td> 145175.000000</td>\n",
" <td>...</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 1.452310e+05</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td> 145124.981808</td>\n",
" <td> 105.278040</td>\n",
" <td> 88.492285</td>\n",
" <td> 3721.946210</td>\n",
" <td> 0.693907</td>\n",
" <td> 0.51316</td>\n",
" <td> 0.693907</td>\n",
" <td> 0.494493</td>\n",
" <td> 0.476763</td>\n",
" <td> 1.447598</td>\n",
" <td>...</td>\n",
" <td> 0.549965</td>\n",
" <td> 86.664369</td>\n",
" <td> 89.828762</td>\n",
" <td> 914.417259</td>\n",
" <td> 9.904497e+08</td>\n",
" <td> 922.938505</td>\n",
" <td> 936.487905</td>\n",
" <td> 9942.162279</td>\n",
" <td> 8849.839111</td>\n",
" <td> 0.232547</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td> 83890.290627</td>\n",
" <td> 143.351237</td>\n",
" <td> 125.903323</td>\n",
" <td> 12280.012213</td>\n",
" <td> 1.604578</td>\n",
" <td> 1.17531</td>\n",
" <td> 1.604578</td>\n",
" <td> 1.115085</td>\n",
" <td> 1.000755</td>\n",
" <td> 1.132784</td>\n",
" <td>...</td>\n",
" <td> 6.361312</td>\n",
" <td> 31.347700</td>\n",
" <td> 26.928221</td>\n",
" <td> 276.361514</td>\n",
" <td> 9.725818e+07</td>\n",
" <td> 212.087416</td>\n",
" <td> 210.370833</td>\n",
" <td> 743.153332</td>\n",
" <td> 3175.797333</td>\n",
" <td> 0.422457</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td> 2.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.00000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td>...</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000e+00</td>\n",
" <td> 1.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td> 72342.500000</td>\n",
" <td> 24.000000</td>\n",
" <td> 13.000000</td>\n",
" <td> 1790.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.00000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 1.000000</td>\n",
" <td>...</td>\n",
" <td> 0.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 1.000000e+09</td>\n",
" <td> 998.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td> 145272.000000</td>\n",
" <td> 60.000000</td>\n",
" <td> 55.000000</td>\n",
" <td> 2500.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.00000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 1.000000</td>\n",
" <td>...</td>\n",
" <td> 0.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 1.000000e+09</td>\n",
" <td> 998.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td> 217686.500000</td>\n",
" <td> 132.000000</td>\n",
" <td> 120.000000</td>\n",
" <td> 3600.000000</td>\n",
" <td> 1.000000</td>\n",
" <td> 1.00000</td>\n",
" <td> 1.000000</td>\n",
" <td> 1.000000</td>\n",
" <td> 1.000000</td>\n",
" <td> 1.000000</td>\n",
" <td>...</td>\n",
" <td> 0.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 1.000000e+09</td>\n",
" <td> 998.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td> 290463.000000</td>\n",
" <td> 999.000000</td>\n",
" <td> 999.000000</td>\n",
" <td> 2200000.000000</td>\n",
" <td> 68.000000</td>\n",
" <td> 66.00000</td>\n",
" <td> 68.000000</td>\n",
" <td> 67.000000</td>\n",
" <td> 35.000000</td>\n",
" <td> 39.000000</td>\n",
" <td>...</td>\n",
" <td> 99.000000</td>\n",
" <td> 99.000000</td>\n",
" <td> 99.000000</td>\n",
" <td> 999.000000</td>\n",
" <td> 1.000000e+09</td>\n",
" <td> 999.000000</td>\n",
" <td> 999.000000</td>\n",
" <td> 9999.000000</td>\n",
" <td> 9999.000000</td>\n",
" <td> 1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 1883 columns</p>\n",
"</div>"
],
"text/plain": [
" ID VAR_0002 VAR_0003 VAR_0004 \\\n",
"count 145231.000000 145231.000000 145231.000000 145231.000000 \n",
"mean 145124.981808 105.278040 88.492285 3721.946210 \n",
"std 83890.290627 143.351237 125.903323 12280.012213 \n",
"min 2.000000 0.000000 0.000000 0.000000 \n",
"25% 72342.500000 24.000000 13.000000 1790.000000 \n",
"50% 145272.000000 60.000000 55.000000 2500.000000 \n",
"75% 217686.500000 132.000000 120.000000 3600.000000 \n",
"max 290463.000000 999.000000 999.000000 2200000.000000 \n",
"\n",
" VAR_0006 VAR_0007 VAR_0013 VAR_0014 \\\n",
"count 145175.000000 145175.00000 145175.000000 145175.000000 \n",
"mean 0.693907 0.51316 0.693907 0.494493 \n",
"std 1.604578 1.17531 1.604578 1.115085 \n",
"min 0.000000 0.00000 0.000000 0.000000 \n",
"25% 0.000000 0.00000 0.000000 0.000000 \n",
"50% 0.000000 0.00000 0.000000 0.000000 \n",
"75% 1.000000 1.00000 1.000000 1.000000 \n",
"max 68.000000 66.00000 68.000000 67.000000 \n",
"\n",
" VAR_0015 VAR_0016 ... VAR_1925 \\\n",
"count 145175.000000 145175.000000 ... 145231.000000 \n",
"mean 0.476763 1.447598 ... 0.549965 \n",
"std 1.000755 1.132784 ... 6.361312 \n",
"min 0.000000 0.000000 ... 0.000000 \n",
"25% 0.000000 1.000000 ... 0.000000 \n",
"50% 0.000000 1.000000 ... 0.000000 \n",
"75% 1.000000 1.000000 ... 0.000000 \n",
"max 35.000000 39.000000 ... 99.000000 \n",
"\n",
" VAR_1926 VAR_1927 VAR_1928 VAR_1929 \\\n",
"count 145231.000000 145231.000000 145231.000000 1.452310e+05 \n",
"mean 86.664369 89.828762 914.417259 9.904497e+08 \n",
"std 31.347700 26.928221 276.361514 9.725818e+07 \n",
"min 0.000000 0.000000 0.000000 0.000000e+00 \n",
"25% 98.000000 98.000000 998.000000 1.000000e+09 \n",
"50% 98.000000 98.000000 998.000000 1.000000e+09 \n",
"75% 98.000000 98.000000 998.000000 1.000000e+09 \n",
"max 99.000000 99.000000 999.000000 1.000000e+09 \n",
"\n",
" VAR_1930 VAR_1931 VAR_1932 VAR_1933 \\\n",
"count 145231.000000 145231.000000 145231.000000 145231.000000 \n",
"mean 922.938505 936.487905 9942.162279 8849.839111 \n",
"std 212.087416 210.370833 743.153332 3175.797333 \n",
"min 1.000000 0.000000 0.000000 0.000000 \n",
"25% 998.000000 998.000000 9998.000000 9998.000000 \n",
"50% 998.000000 998.000000 9998.000000 9998.000000 \n",
"75% 998.000000 998.000000 9998.000000 9998.000000 \n",
"max 999.000000 999.000000 9999.000000 9999.000000 \n",
"\n",
" target \n",
"count 145231.000000 \n",
"mean 0.232547 \n",
"std 0.422457 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 0.000000 \n",
"75% 0.000000 \n",
"max 1.000000 \n",
"\n",
"[8 rows x 1883 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# lcd.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# url1 = \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/GreenLeaf/train_describe.txt\"\n",
"# f = open(url1, 'w')\n",
"# f.write(str(lcd.describe()))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# f.close()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# remove constants\n",
"nunique = pd.Series([train[col].nunique() for col in train.columns], index = train.columns)\n",
"constants = nunique[nunique<2].index.tolist()\n",
"train = train.drop(constants,axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:198: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" flag = np.concatenate(([True], aux[1:] != aux[:-1]))\n",
"/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:251: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" return aux[:-1][aux[1:] == aux[:-1]]\n",
"/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:384: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" bool_ar = (sar[1:] == sar[:-1])\n"
]
}
],
"source": [
"from sklearn import ensemble, preprocessing, cross_validation\n",
"from sklearn.metrics import roc_auc_score as auc\n",
"from time import time\n",
"\n",
"# encode string\n",
"strings = train.dtypes == 'object'; strings = strings[strings].index.tolist(); encoders = {}\n",
"for col in strings:\n",
" encoders[col] = preprocessing.LabelEncoder()\n",
" train[col] = encoders[col].fit_transform(train[col])\n",
" try:\n",
" test[col] = encoders[col].transform(test[col])\n",
" except:\n",
" # lazy way to incorporate the feature only if can be encoded in the test set\n",
" del test[col]\n",
" del train[col]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# DATA ready\n",
"X = train.drop('target',1).fillna(0); y = train.target\n",
"\n",
"# RF FTW :)\n",
"rf = ensemble.RandomForestClassifier(n_jobs=4, n_estimators = 20, random_state = 11)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TRAIN 1.0000 | TEST 0.7297 | TIME 7.91m (1-fold)\n"
]
}
],
"source": [
"# CROSS VALIDATE AND PRINT TRAIN AND TEST SCORE\n",
"kf = cross_validation.StratifiedKFold(y, n_folds=3, shuffle=True, random_state=11)\n",
"trscores, cvscores, times = [], [], []\n",
"for itr, icv in kf:\n",
" t = time()\n",
" trscore = auc(y.iloc[itr], rf.fit(X.iloc[itr], y.iloc[itr]).predict_proba(X.iloc[itr])[:,1])\n",
" cvscore = auc(y.iloc[icv], rf.predict_proba(X.iloc[icv])[:,1])\n",
" trscores.append(trscore); cvscores.append(cvscore); times.append(time()-t)\n",
"print \"TRAIN %.4f | TEST %.4f | TIME %.2fm (1-fold)\" % (np.mean(trscores), np.mean(cvscores), np.mean(times)/60)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Applications/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (8,9,10,11,12,43,157,167,177,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
}
],
"source": [
"# load relevant libraries\n",
"\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import seaborn as sns\n",
"\n",
"%matplotlib inline\n",
"\n",
"# Read in Greenleaf Train.csv train file\n",
"url = \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/SpringLeaf/train.csv\"\n",
"train = pd.read_csv(url).set_index(\"ID\")\n",
"\n",
"# Read in Greenleaf Train.csv teat file\n",
"url = \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/SpringLeaf/test.csv\"\n",
"test = pd.read_csv(url).set_index(\"ID\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>VAR_0002</th>\n",
" <th>VAR_0003</th>\n",
" <th>VAR_0004</th>\n",
" <th>VAR_0006</th>\n",
" <th>VAR_0007</th>\n",
" <th>VAR_0013</th>\n",
" <th>VAR_0014</th>\n",
" <th>VAR_0015</th>\n",
" <th>VAR_0016</th>\n",
" <th>...</th>\n",
" <th>VAR_1925</th>\n",
" <th>VAR_1926</th>\n",
" <th>VAR_1927</th>\n",
" <th>VAR_1928</th>\n",
" <th>VAR_1929</th>\n",
" <th>VAR_1930</th>\n",
" <th>VAR_1931</th>\n",
" <th>VAR_1932</th>\n",
" <th>VAR_1933</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145175.000000</td>\n",
" <td> 145175.00000</td>\n",
" <td> 145175.000000</td>\n",
" <td> 145175.000000</td>\n",
" <td> 145175.000000</td>\n",
" <td> 145175.000000</td>\n",
" <td>...</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 1.452310e+05</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" <td> 145231.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td> 145124.981808</td>\n",
" <td> 105.278040</td>\n",
" <td> 88.492285</td>\n",
" <td> 3721.946210</td>\n",
" <td> 0.693907</td>\n",
" <td> 0.51316</td>\n",
" <td> 0.693907</td>\n",
" <td> 0.494493</td>\n",
" <td> 0.476763</td>\n",
" <td> 1.447598</td>\n",
" <td>...</td>\n",
" <td> 0.549965</td>\n",
" <td> 86.664369</td>\n",
" <td> 89.828762</td>\n",
" <td> 914.417259</td>\n",
" <td> 9.904497e+08</td>\n",
" <td> 922.938505</td>\n",
" <td> 936.487905</td>\n",
" <td> 9942.162279</td>\n",
" <td> 8849.839111</td>\n",
" <td> 0.232547</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td> 83890.290627</td>\n",
" <td> 143.351237</td>\n",
" <td> 125.903323</td>\n",
" <td> 12280.012213</td>\n",
" <td> 1.604578</td>\n",
" <td> 1.17531</td>\n",
" <td> 1.604578</td>\n",
" <td> 1.115085</td>\n",
" <td> 1.000755</td>\n",
" <td> 1.132784</td>\n",
" <td>...</td>\n",
" <td> 6.361312</td>\n",
" <td> 31.347700</td>\n",
" <td> 26.928221</td>\n",
" <td> 276.361514</td>\n",
" <td> 9.725818e+07</td>\n",
" <td> 212.087416</td>\n",
" <td> 210.370833</td>\n",
" <td> 743.153332</td>\n",
" <td> 3175.797333</td>\n",
" <td> 0.422457</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td> 2.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.00000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td>...</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000e+00</td>\n",
" <td> 1.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td> 72342.500000</td>\n",
" <td> 24.000000</td>\n",
" <td> 13.000000</td>\n",
" <td> 1790.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.00000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 1.000000</td>\n",
" <td>...</td>\n",
" <td> 0.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 1.000000e+09</td>\n",
" <td> 998.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td> 145272.000000</td>\n",
" <td> 60.000000</td>\n",
" <td> 55.000000</td>\n",
" <td> 2500.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.00000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 0.000000</td>\n",
" <td> 1.000000</td>\n",
" <td>...</td>\n",
" <td> 0.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 1.000000e+09</td>\n",
" <td> 998.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td> 217686.500000</td>\n",
" <td> 132.000000</td>\n",
" <td> 120.000000</td>\n",
" <td> 3600.000000</td>\n",
" <td> 1.000000</td>\n",
" <td> 1.00000</td>\n",
" <td> 1.000000</td>\n",
" <td> 1.000000</td>\n",
" <td> 1.000000</td>\n",
" <td> 1.000000</td>\n",
" <td>...</td>\n",
" <td> 0.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 98.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 1.000000e+09</td>\n",
" <td> 998.000000</td>\n",
" <td> 998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 9998.000000</td>\n",
" <td> 0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td> 290463.000000</td>\n",
" <td> 999.000000</td>\n",
" <td> 999.000000</td>\n",
" <td> 2200000.000000</td>\n",
" <td> 68.000000</td>\n",
" <td> 66.00000</td>\n",
" <td> 68.000000</td>\n",
" <td> 67.000000</td>\n",
" <td> 35.000000</td>\n",
" <td> 39.000000</td>\n",
" <td>...</td>\n",
" <td> 99.000000</td>\n",
" <td> 99.000000</td>\n",
" <td> 99.000000</td>\n",
" <td> 999.000000</td>\n",
" <td> 1.000000e+09</td>\n",
" <td> 999.000000</td>\n",
" <td> 999.000000</td>\n",
" <td> 9999.000000</td>\n",
" <td> 9999.000000</td>\n",
" <td> 1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 1883 columns</p>\n",
"</div>"
],
"text/plain": [
" ID VAR_0002 VAR_0003 VAR_0004 \\\n",
"count 145231.000000 145231.000000 145231.000000 145231.000000 \n",
"mean 145124.981808 105.278040 88.492285 3721.946210 \n",
"std 83890.290627 143.351237 125.903323 12280.012213 \n",
"min 2.000000 0.000000 0.000000 0.000000 \n",
"25% 72342.500000 24.000000 13.000000 1790.000000 \n",
"50% 145272.000000 60.000000 55.000000 2500.000000 \n",
"75% 217686.500000 132.000000 120.000000 3600.000000 \n",
"max 290463.000000 999.000000 999.000000 2200000.000000 \n",
"\n",
" VAR_0006 VAR_0007 VAR_0013 VAR_0014 \\\n",
"count 145175.000000 145175.00000 145175.000000 145175.000000 \n",
"mean 0.693907 0.51316 0.693907 0.494493 \n",
"std 1.604578 1.17531 1.604578 1.115085 \n",
"min 0.000000 0.00000 0.000000 0.000000 \n",
"25% 0.000000 0.00000 0.000000 0.000000 \n",
"50% 0.000000 0.00000 0.000000 0.000000 \n",
"75% 1.000000 1.00000 1.000000 1.000000 \n",
"max 68.000000 66.00000 68.000000 67.000000 \n",
"\n",
" VAR_0015 VAR_0016 ... VAR_1925 \\\n",
"count 145175.000000 145175.000000 ... 145231.000000 \n",
"mean 0.476763 1.447598 ... 0.549965 \n",
"std 1.000755 1.132784 ... 6.361312 \n",
"min 0.000000 0.000000 ... 0.000000 \n",
"25% 0.000000 1.000000 ... 0.000000 \n",
"50% 0.000000 1.000000 ... 0.000000 \n",
"75% 1.000000 1.000000 ... 0.000000 \n",
"max 35.000000 39.000000 ... 99.000000 \n",
"\n",
" VAR_1926 VAR_1927 VAR_1928 VAR_1929 \\\n",
"count 145231.000000 145231.000000 145231.000000 1.452310e+05 \n",
"mean 86.664369 89.828762 914.417259 9.904497e+08 \n",
"std 31.347700 26.928221 276.361514 9.725818e+07 \n",
"min 0.000000 0.000000 0.000000 0.000000e+00 \n",
"25% 98.000000 98.000000 998.000000 1.000000e+09 \n",
"50% 98.000000 98.000000 998.000000 1.000000e+09 \n",
"75% 98.000000 98.000000 998.000000 1.000000e+09 \n",
"max 99.000000 99.000000 999.000000 1.000000e+09 \n",
"\n",
" VAR_1930 VAR_1931 VAR_1932 VAR_1933 \\\n",
"count 145231.000000 145231.000000 145231.000000 145231.000000 \n",
"mean 922.938505 936.487905 9942.162279 8849.839111 \n",
"std 212.087416 210.370833 743.153332 3175.797333 \n",
"min 1.000000 0.000000 0.000000 0.000000 \n",
"25% 998.000000 998.000000 9998.000000 9998.000000 \n",
"50% 998.000000 998.000000 9998.000000 9998.000000 \n",
"75% 998.000000 998.000000 9998.000000 9998.000000 \n",
"max 999.000000 999.000000 9999.000000 9999.000000 \n",
"\n",
" target \n",
"count 145231.000000 \n",
"mean 0.232547 \n",
"std 0.422457 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 0.000000 \n",
"75% 0.000000 \n",
"max 1.000000 \n",
"\n",
"[8 rows x 1883 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# lcd.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# url1 = \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/GreenLeaf/train_describe.txt\"\n",
"# f = open(url1, 'w')\n",
"# f.write(str(lcd.describe()))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# f.close()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# remove constants\n",
"nunique = pd.Series([train[col].nunique() for col in train.columns], index = train.columns)\n",
"constants = nunique[nunique<2].index.tolist()\n",
"train = train.drop(constants,axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:198: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" flag = np.concatenate(([True], aux[1:] != aux[:-1]))\n",
"/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:251: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" return aux[:-1][aux[1:] == aux[:-1]]\n",
"/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:384: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" bool_ar = (sar[1:] == sar[:-1])\n"
]
}
],
"source": [
"from sklearn import ensemble, preprocessing, cross_validation\n",
"from sklearn.metrics import roc_auc_score as auc\n",
"from time import time\n",
"\n",
"# encode string\n",
"strings = train.dtypes == 'object'; strings = strings[strings].index.tolist(); encoders = {}\n",
"for col in strings:\n",
" encoders[col] = preprocessing.LabelEncoder()\n",
" train[col] = encoders[col].fit_transform(train[col])\n",
" try:\n",
" test[col] = encoders[col].transform(test[col])\n",
" except:\n",
" # lazy way to incorporate the feature only if can be encoded in the test set\n",
" del test[col]\n",
" del train[col]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# DATA ready\n",
"X = train.drop('target',1).fillna(0); y = train.target\n",
"\n",
"# RF FTW :)\n",
"rf = ensemble.RandomForestClassifier(n_jobs=4, n_estimators = 20, random_state = 11)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TRAIN 1.0000 | TEST 0.7297 | TIME 7.91m (1-fold)\n"
]
}
],
"source": [
"# CROSS VALIDATE AND PRINT TRAIN AND TEST SCORE\n",
"kf = cross_validation.StratifiedKFold(y, n_folds=3, shuffle=True, random_state=11)\n",
"trscores, cvscores, times = [], [], []\n",
"for itr, icv in kf:\n",
" t = time()\n",
" trscore = auc(y.iloc[itr], rf.fit(X.iloc[itr], y.iloc[itr]).predict_proba(X.iloc[itr])[:,1])\n",
" cvscore = auc(y.iloc[icv], rf.predict_proba(X.iloc[icv])[:,1])\n",
" trscores.append(trscore); cvscores.append(cvscore); times.append(time()-t)\n",
"print \"TRAIN %.4f | TEST %.4f | TIME %.2fm (1-fold)\" % (np.mean(trscores), np.mean(cvscores), np.mean(times)/60)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment