Vikrant79/Coupon Purchase Prediction - BTB Script.ipynb

## Coupon Purchase Prediction - BTB Script.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Coupon Purchase Prediction - BTB Script.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Coupon Purchase Prediction - First Script.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Coupon Purchase Prediction - First Script.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Flavor of Physics - Classification.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Flavor of Physics - Classification.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Flavor of Physics - Mix of models.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Flavor of Physics - Mix of models.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Springleaf - Experiments with Classification.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf - Experiments with Classification.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Springleaf - Experiments with Random Forest.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf - Experiments with Random Forest.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## SpringLeaf -Kaggle 18AUG15 (1).ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Applications/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (8,9,10,11,12,43,157,167,177,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  data = self._reader.read(nrows)\n"
     ]
    }
   ],
   "source": [
    "# Load relevant libraries\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "# Folder holding the Springleaf competition CSV files. Set the\n",
    "# SPRINGLEAF_DATA_DIR environment variable to point somewhere else;\n",
    "# the default is the folder this notebook was originally run from.\n",
    "DATA_DIR = os.environ.get(\n",
    "    \"SPRINGLEAF_DATA_DIR\",\n",
    "    \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/SpringLeaf\",\n",
    ")\n",
    "\n",
    "# Read the Springleaf train.csv file, indexed by its ID column.\n",
    "# low_memory=False makes pandas infer each column's dtype from the whole\n",
    "# file instead of chunk by chunk, so the columns flagged by the\n",
    "# 'mixed types' DtypeWarning are parsed consistently.\n",
    "url = os.path.join(DATA_DIR, \"train.csv\")\n",
    "train = pd.read_csv(url, low_memory=False).set_index(\"ID\")\n",
    "\n",
    "# Read the Springleaf test.csv file the same way.\n",
    "url = os.path.join(DATA_DIR, \"test.csv\")\n",
    "test = pd.read_csv(url, low_memory=False).set_index(\"ID\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>VAR_0002</th>\n",
       "      <th>VAR_0003</th>\n",
       "      <th>VAR_0004</th>\n",
       "      <th>VAR_0006</th>\n",
       "      <th>VAR_0007</th>\n",
       "      <th>VAR_0013</th>\n",
       "      <th>VAR_0014</th>\n",
       "      <th>VAR_0015</th>\n",
       "      <th>VAR_0016</th>\n",
       "      <th>...</th>\n",
       "      <th>VAR_1925</th>\n",
       "      <th>VAR_1926</th>\n",
       "      <th>VAR_1927</th>\n",
       "      <th>VAR_1928</th>\n",
       "      <th>VAR_1929</th>\n",
       "      <th>VAR_1930</th>\n",
       "      <th>VAR_1931</th>\n",
       "      <th>VAR_1932</th>\n",
       "      <th>VAR_1933</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td>  145231.000000</td>\n",
       "      <td> 145175.000000</td>\n",
       "      <td> 145175.00000</td>\n",
       "      <td> 145175.000000</td>\n",
       "      <td> 145175.000000</td>\n",
       "      <td> 145175.000000</td>\n",
       "      <td> 145175.000000</td>\n",
       "      <td>...</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 1.452310e+05</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td> 145124.981808</td>\n",
       "      <td>    105.278040</td>\n",
       "      <td>     88.492285</td>\n",
       "      <td>    3721.946210</td>\n",
       "      <td>      0.693907</td>\n",
       "      <td>      0.51316</td>\n",
       "      <td>      0.693907</td>\n",
       "      <td>      0.494493</td>\n",
       "      <td>      0.476763</td>\n",
       "      <td>      1.447598</td>\n",
       "      <td>...</td>\n",
       "      <td>      0.549965</td>\n",
       "      <td>     86.664369</td>\n",
       "      <td>     89.828762</td>\n",
       "      <td>    914.417259</td>\n",
       "      <td> 9.904497e+08</td>\n",
       "      <td>    922.938505</td>\n",
       "      <td>    936.487905</td>\n",
       "      <td>   9942.162279</td>\n",
       "      <td>   8849.839111</td>\n",
       "      <td>      0.232547</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>  83890.290627</td>\n",
       "      <td>    143.351237</td>\n",
       "      <td>    125.903323</td>\n",
       "      <td>   12280.012213</td>\n",
       "      <td>      1.604578</td>\n",
       "      <td>      1.17531</td>\n",
       "      <td>      1.604578</td>\n",
       "      <td>      1.115085</td>\n",
       "      <td>      1.000755</td>\n",
       "      <td>      1.132784</td>\n",
       "      <td>...</td>\n",
       "      <td>      6.361312</td>\n",
       "      <td>     31.347700</td>\n",
       "      <td>     26.928221</td>\n",
       "      <td>    276.361514</td>\n",
       "      <td> 9.725818e+07</td>\n",
       "      <td>    212.087416</td>\n",
       "      <td>    210.370833</td>\n",
       "      <td>    743.153332</td>\n",
       "      <td>   3175.797333</td>\n",
       "      <td>      0.422457</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>      2.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>       0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.00000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td> 0.000000e+00</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>  72342.500000</td>\n",
       "      <td>     24.000000</td>\n",
       "      <td>     13.000000</td>\n",
       "      <td>    1790.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.00000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td> 1.000000e+09</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td> 145272.000000</td>\n",
       "      <td>     60.000000</td>\n",
       "      <td>     55.000000</td>\n",
       "      <td>    2500.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.00000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td> 1.000000e+09</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td> 217686.500000</td>\n",
       "      <td>    132.000000</td>\n",
       "      <td>    120.000000</td>\n",
       "      <td>    3600.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>      1.00000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td> 1.000000e+09</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td> 290463.000000</td>\n",
       "      <td>    999.000000</td>\n",
       "      <td>    999.000000</td>\n",
       "      <td> 2200000.000000</td>\n",
       "      <td>     68.000000</td>\n",
       "      <td>     66.00000</td>\n",
       "      <td>     68.000000</td>\n",
       "      <td>     67.000000</td>\n",
       "      <td>     35.000000</td>\n",
       "      <td>     39.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>     99.000000</td>\n",
       "      <td>     99.000000</td>\n",
       "      <td>     99.000000</td>\n",
       "      <td>    999.000000</td>\n",
       "      <td> 1.000000e+09</td>\n",
       "      <td>    999.000000</td>\n",
       "      <td>    999.000000</td>\n",
       "      <td>   9999.000000</td>\n",
       "      <td>   9999.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 1883 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  ID       VAR_0002       VAR_0003        VAR_0004  \\\n",
       "count  145231.000000  145231.000000  145231.000000   145231.000000   \n",
       "mean   145124.981808     105.278040      88.492285     3721.946210   \n",
       "std     83890.290627     143.351237     125.903323    12280.012213   \n",
       "min         2.000000       0.000000       0.000000        0.000000   \n",
       "25%     72342.500000      24.000000      13.000000     1790.000000   \n",
       "50%    145272.000000      60.000000      55.000000     2500.000000   \n",
       "75%    217686.500000     132.000000     120.000000     3600.000000   \n",
       "max    290463.000000     999.000000     999.000000  2200000.000000   \n",
       "\n",
       "            VAR_0006      VAR_0007       VAR_0013       VAR_0014  \\\n",
       "count  145175.000000  145175.00000  145175.000000  145175.000000   \n",
       "mean        0.693907       0.51316       0.693907       0.494493   \n",
       "std         1.604578       1.17531       1.604578       1.115085   \n",
       "min         0.000000       0.00000       0.000000       0.000000   \n",
       "25%         0.000000       0.00000       0.000000       0.000000   \n",
       "50%         0.000000       0.00000       0.000000       0.000000   \n",
       "75%         1.000000       1.00000       1.000000       1.000000   \n",
       "max        68.000000      66.00000      68.000000      67.000000   \n",
       "\n",
       "            VAR_0015       VAR_0016      ...             VAR_1925  \\\n",
       "count  145175.000000  145175.000000      ...        145231.000000   \n",
       "mean        0.476763       1.447598      ...             0.549965   \n",
       "std         1.000755       1.132784      ...             6.361312   \n",
       "min         0.000000       0.000000      ...             0.000000   \n",
       "25%         0.000000       1.000000      ...             0.000000   \n",
       "50%         0.000000       1.000000      ...             0.000000   \n",
       "75%         1.000000       1.000000      ...             0.000000   \n",
       "max        35.000000      39.000000      ...            99.000000   \n",
       "\n",
       "            VAR_1926       VAR_1927       VAR_1928      VAR_1929  \\\n",
       "count  145231.000000  145231.000000  145231.000000  1.452310e+05   \n",
       "mean       86.664369      89.828762     914.417259  9.904497e+08   \n",
       "std        31.347700      26.928221     276.361514  9.725818e+07   \n",
       "min         0.000000       0.000000       0.000000  0.000000e+00   \n",
       "25%        98.000000      98.000000     998.000000  1.000000e+09   \n",
       "50%        98.000000      98.000000     998.000000  1.000000e+09   \n",
       "75%        98.000000      98.000000     998.000000  1.000000e+09   \n",
       "max        99.000000      99.000000     999.000000  1.000000e+09   \n",
       "\n",
       "            VAR_1930       VAR_1931       VAR_1932       VAR_1933  \\\n",
       "count  145231.000000  145231.000000  145231.000000  145231.000000   \n",
       "mean      922.938505     936.487905    9942.162279    8849.839111   \n",
       "std       212.087416     210.370833     743.153332    3175.797333   \n",
       "min         1.000000       0.000000       0.000000       0.000000   \n",
       "25%       998.000000     998.000000    9998.000000    9998.000000   \n",
       "50%       998.000000     998.000000    9998.000000    9998.000000   \n",
       "75%       998.000000     998.000000    9998.000000    9998.000000   \n",
       "max       999.000000     999.000000    9999.000000    9999.000000   \n",
       "\n",
       "              target  \n",
       "count  145231.000000  \n",
       "mean        0.232547  \n",
       "std         0.422457  \n",
       "min         0.000000  \n",
       "25%         0.000000  \n",
       "50%         0.000000  \n",
       "75%         0.000000  \n",
       "max         1.000000  \n",
       "\n",
       "[8 rows x 1883 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lcd.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# url1 = \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/GreenLeaf/train_describe.txt\"\n",
    "# f = open(url1, 'w')\n",
    "# f.write(str(lcd.describe()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Remove constant features.\n",
    "# Series.nunique() does not count NaN, so a column with fewer than two\n",
    "# distinct non-missing values is treated as constant.\n",
    "nunique = pd.Series([train[col].nunique() for col in train.columns], index = train.columns)\n",
    "constants = nunique[nunique<2].index.tolist()\n",
    "train = train.drop(constants,axis=1)\n",
    "\n",
    "# Drop the same columns from test so both frames keep the same feature set;\n",
    "# the membership check skips any column that test does not contain.\n",
    "test = test.drop([col for col in constants if col in test.columns], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:198: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
      "  flag = np.concatenate(([True], aux[1:] != aux[:-1]))\n",
      "/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:251: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
      "  return aux[:-1][aux[1:] == aux[:-1]]\n",
      "/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:384: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
      "  bool_ar = (sar[1:] == sar[:-1])\n"
     ]
    }
   ],
   "source": [
    "from sklearn import ensemble, preprocessing, cross_validation\n",
    "from sklearn.metrics import roc_auc_score as auc\n",
    "from time import time\n",
    "\n",
    "# Label-encode every string (object dtype) column of train.\n",
    "# One LabelEncoder per column is kept in `encoders`, keyed by column name.\n",
    "strings = train.dtypes == 'object'\n",
    "strings = strings[strings].index.tolist()\n",
    "encoders = {}\n",
    "for col in strings:\n",
    "    encoders[col] = preprocessing.LabelEncoder()\n",
    "    train[col] = encoders[col].fit_transform(train[col])\n",
    "    try:\n",
    "        # The encoder was fitted on train only, so this fails when test\n",
    "        # cannot be encoded with it (e.g. labels not seen during fit).\n",
    "        test[col] = encoders[col].transform(test[col])\n",
    "    except Exception:\n",
    "        # Lazy way to incorporate the feature only if it can be encoded in\n",
    "        # the test set: otherwise the column is removed from both frames.\n",
    "        # Catching Exception instead of using a bare except lets\n",
    "        # KeyboardInterrupt/SystemExit stop the loop rather than being\n",
    "        # mistaken for an encoding failure.\n",
    "        del test[col]\n",
    "        del train[col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Model inputs: the label column, and every other column with missing\n",
    "# values replaced by 0.\n",
    "y = train['target']\n",
    "X = train.drop('target', axis=1).fillna(0)\n",
    "\n",
    "# Random forest classifier: 20 trees, 4 parallel jobs, fixed random seed.\n",
    "rf = ensemble.RandomForestClassifier(\n",
    "    n_estimators=20,\n",
    "    n_jobs=4,\n",
    "    random_state=11,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TRAIN 1.0000 | TEST 0.7297 | TIME 7.91m (1-fold)\n"
     ]
    }
   ],
   "source": [
    "# Cross-validate the forest and report the mean AUC on the fitted rows,\n",
    "# the mean AUC on the held-out rows, and the mean time per fold in minutes.\n",
    "kf = cross_validation.StratifiedKFold(y, n_folds=3, shuffle=True, random_state=11)\n",
    "trscores = []\n",
    "cvscores = []\n",
    "times = []\n",
    "for itr, icv in kf:\n",
    "    t = time()\n",
    "    # Fit on this fold's training rows, then score those same rows.\n",
    "    rf.fit(X.iloc[itr], y.iloc[itr])\n",
    "    trscore = auc(y.iloc[itr], rf.predict_proba(X.iloc[itr])[:,1])\n",
    "    # Score the rows held out from this fold.\n",
    "    cvscore = auc(y.iloc[icv], rf.predict_proba(X.iloc[icv])[:,1])\n",
    "    trscores.append(trscore)\n",
    "    cvscores.append(cvscore)\n",
    "    times.append(time()-t)\n",
    "# A parenthesised single argument prints the same text under the Python 2 print statement.\n",
    "print(\"TRAIN %.4f | TEST %.4f | TIME %.2fm (1-fold)\" % (np.mean(trscores), np.mean(cvscores), np.mean(times)/60))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

## SpringLeaf -Kaggle 18AUG15.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Applications/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (8,9,10,11,12,43,157,167,177,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  data = self._reader.read(nrows)\n"
     ]
    }
   ],
   "source": [
    "# Load relevant libraries\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "# Folder holding the Springleaf competition CSV files. Set the\n",
    "# SPRINGLEAF_DATA_DIR environment variable to point somewhere else;\n",
    "# the default is the folder this notebook was originally run from.\n",
    "DATA_DIR = os.environ.get(\n",
    "    \"SPRINGLEAF_DATA_DIR\",\n",
    "    \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/SpringLeaf\",\n",
    ")\n",
    "\n",
    "# Read the Springleaf train.csv file, indexed by its ID column.\n",
    "# low_memory=False makes pandas infer each column's dtype from the whole\n",
    "# file instead of chunk by chunk, so the columns flagged by the\n",
    "# 'mixed types' DtypeWarning are parsed consistently.\n",
    "url = os.path.join(DATA_DIR, \"train.csv\")\n",
    "train = pd.read_csv(url, low_memory=False).set_index(\"ID\")\n",
    "\n",
    "# Read the Springleaf test.csv file the same way.\n",
    "url = os.path.join(DATA_DIR, \"test.csv\")\n",
    "test = pd.read_csv(url, low_memory=False).set_index(\"ID\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>VAR_0002</th>\n",
       "      <th>VAR_0003</th>\n",
       "      <th>VAR_0004</th>\n",
       "      <th>VAR_0006</th>\n",
       "      <th>VAR_0007</th>\n",
       "      <th>VAR_0013</th>\n",
       "      <th>VAR_0014</th>\n",
       "      <th>VAR_0015</th>\n",
       "      <th>VAR_0016</th>\n",
       "      <th>...</th>\n",
       "      <th>VAR_1925</th>\n",
       "      <th>VAR_1926</th>\n",
       "      <th>VAR_1927</th>\n",
       "      <th>VAR_1928</th>\n",
       "      <th>VAR_1929</th>\n",
       "      <th>VAR_1930</th>\n",
       "      <th>VAR_1931</th>\n",
       "      <th>VAR_1932</th>\n",
       "      <th>VAR_1933</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td>  145231.000000</td>\n",
       "      <td> 145175.000000</td>\n",
       "      <td> 145175.00000</td>\n",
       "      <td> 145175.000000</td>\n",
       "      <td> 145175.000000</td>\n",
       "      <td> 145175.000000</td>\n",
       "      <td> 145175.000000</td>\n",
       "      <td>...</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 1.452310e+05</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "      <td> 145231.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td> 145124.981808</td>\n",
       "      <td>    105.278040</td>\n",
       "      <td>     88.492285</td>\n",
       "      <td>    3721.946210</td>\n",
       "      <td>      0.693907</td>\n",
       "      <td>      0.51316</td>\n",
       "      <td>      0.693907</td>\n",
       "      <td>      0.494493</td>\n",
       "      <td>      0.476763</td>\n",
       "      <td>      1.447598</td>\n",
       "      <td>...</td>\n",
       "      <td>      0.549965</td>\n",
       "      <td>     86.664369</td>\n",
       "      <td>     89.828762</td>\n",
       "      <td>    914.417259</td>\n",
       "      <td> 9.904497e+08</td>\n",
       "      <td>    922.938505</td>\n",
       "      <td>    936.487905</td>\n",
       "      <td>   9942.162279</td>\n",
       "      <td>   8849.839111</td>\n",
       "      <td>      0.232547</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>  83890.290627</td>\n",
       "      <td>    143.351237</td>\n",
       "      <td>    125.903323</td>\n",
       "      <td>   12280.012213</td>\n",
       "      <td>      1.604578</td>\n",
       "      <td>      1.17531</td>\n",
       "      <td>      1.604578</td>\n",
       "      <td>      1.115085</td>\n",
       "      <td>      1.000755</td>\n",
       "      <td>      1.132784</td>\n",
       "      <td>...</td>\n",
       "      <td>      6.361312</td>\n",
       "      <td>     31.347700</td>\n",
       "      <td>     26.928221</td>\n",
       "      <td>    276.361514</td>\n",
       "      <td> 9.725818e+07</td>\n",
       "      <td>    212.087416</td>\n",
       "      <td>    210.370833</td>\n",
       "      <td>    743.153332</td>\n",
       "      <td>   3175.797333</td>\n",
       "      <td>      0.422457</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>      2.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>       0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.00000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td> 0.000000e+00</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>  72342.500000</td>\n",
       "      <td>     24.000000</td>\n",
       "      <td>     13.000000</td>\n",
       "      <td>    1790.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.00000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td> 1.000000e+09</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td> 145272.000000</td>\n",
       "      <td>     60.000000</td>\n",
       "      <td>     55.000000</td>\n",
       "      <td>    2500.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.00000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td> 1.000000e+09</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td> 217686.500000</td>\n",
       "      <td>    132.000000</td>\n",
       "      <td>    120.000000</td>\n",
       "      <td>    3600.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>      1.00000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>      0.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>     98.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td> 1.000000e+09</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>    998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>   9998.000000</td>\n",
       "      <td>      0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td> 290463.000000</td>\n",
       "      <td>    999.000000</td>\n",
       "      <td>    999.000000</td>\n",
       "      <td> 2200000.000000</td>\n",
       "      <td>     68.000000</td>\n",
       "      <td>     66.00000</td>\n",
       "      <td>     68.000000</td>\n",
       "      <td>     67.000000</td>\n",
       "      <td>     35.000000</td>\n",
       "      <td>     39.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>     99.000000</td>\n",
       "      <td>     99.000000</td>\n",
       "      <td>     99.000000</td>\n",
       "      <td>    999.000000</td>\n",
       "      <td> 1.000000e+09</td>\n",
       "      <td>    999.000000</td>\n",
       "      <td>    999.000000</td>\n",
       "      <td>   9999.000000</td>\n",
       "      <td>   9999.000000</td>\n",
       "      <td>      1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 1883 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  ID       VAR_0002       VAR_0003        VAR_0004  \\\n",
       "count  145231.000000  145231.000000  145231.000000   145231.000000   \n",
       "mean   145124.981808     105.278040      88.492285     3721.946210   \n",
       "std     83890.290627     143.351237     125.903323    12280.012213   \n",
       "min         2.000000       0.000000       0.000000        0.000000   \n",
       "25%     72342.500000      24.000000      13.000000     1790.000000   \n",
       "50%    145272.000000      60.000000      55.000000     2500.000000   \n",
       "75%    217686.500000     132.000000     120.000000     3600.000000   \n",
       "max    290463.000000     999.000000     999.000000  2200000.000000   \n",
       "\n",
       "            VAR_0006      VAR_0007       VAR_0013       VAR_0014  \\\n",
       "count  145175.000000  145175.00000  145175.000000  145175.000000   \n",
       "mean        0.693907       0.51316       0.693907       0.494493   \n",
       "std         1.604578       1.17531       1.604578       1.115085   \n",
       "min         0.000000       0.00000       0.000000       0.000000   \n",
       "25%         0.000000       0.00000       0.000000       0.000000   \n",
       "50%         0.000000       0.00000       0.000000       0.000000   \n",
       "75%         1.000000       1.00000       1.000000       1.000000   \n",
       "max        68.000000      66.00000      68.000000      67.000000   \n",
       "\n",
       "            VAR_0015       VAR_0016      ...             VAR_1925  \\\n",
       "count  145175.000000  145175.000000      ...        145231.000000   \n",
       "mean        0.476763       1.447598      ...             0.549965   \n",
       "std         1.000755       1.132784      ...             6.361312   \n",
       "min         0.000000       0.000000      ...             0.000000   \n",
       "25%         0.000000       1.000000      ...             0.000000   \n",
       "50%         0.000000       1.000000      ...             0.000000   \n",
       "75%         1.000000       1.000000      ...             0.000000   \n",
       "max        35.000000      39.000000      ...            99.000000   \n",
       "\n",
       "            VAR_1926       VAR_1927       VAR_1928      VAR_1929  \\\n",
       "count  145231.000000  145231.000000  145231.000000  1.452310e+05   \n",
       "mean       86.664369      89.828762     914.417259  9.904497e+08   \n",
       "std        31.347700      26.928221     276.361514  9.725818e+07   \n",
       "min         0.000000       0.000000       0.000000  0.000000e+00   \n",
       "25%        98.000000      98.000000     998.000000  1.000000e+09   \n",
       "50%        98.000000      98.000000     998.000000  1.000000e+09   \n",
       "75%        98.000000      98.000000     998.000000  1.000000e+09   \n",
       "max        99.000000      99.000000     999.000000  1.000000e+09   \n",
       "\n",
       "            VAR_1930       VAR_1931       VAR_1932       VAR_1933  \\\n",
       "count  145231.000000  145231.000000  145231.000000  145231.000000   \n",
       "mean      922.938505     936.487905    9942.162279    8849.839111   \n",
       "std       212.087416     210.370833     743.153332    3175.797333   \n",
       "min         1.000000       0.000000       0.000000       0.000000   \n",
       "25%       998.000000     998.000000    9998.000000    9998.000000   \n",
       "50%       998.000000     998.000000    9998.000000    9998.000000   \n",
       "75%       998.000000     998.000000    9998.000000    9998.000000   \n",
       "max       999.000000     999.000000    9999.000000    9999.000000   \n",
       "\n",
       "              target  \n",
       "count  145231.000000  \n",
       "mean        0.232547  \n",
       "std         0.422457  \n",
       "min         0.000000  \n",
       "25%         0.000000  \n",
       "50%         0.000000  \n",
       "75%         0.000000  \n",
       "max         1.000000  \n",
       "\n",
       "[8 rows x 1883 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lcd.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# url1 = \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/GreenLeaf/train_describe.txt\"\n",
    "# f = open(url1, 'w')\n",
    "# f.write(str(lcd.describe()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Drop constant columns: fewer than two distinct values (as counted by Series.nunique)\n",
    "nunique = pd.Series(\n",
    "    [train[col].nunique() for col in train.columns],\n",
    "    index=train.columns,\n",
    ")\n",
    "is_constant = nunique < 2\n",
    "constants = nunique.index[is_constant.values].tolist()\n",
    "train = train.drop(constants, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:198: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
      "  flag = np.concatenate(([True], aux[1:] != aux[:-1]))\n",
      "/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:251: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
      "  return aux[:-1][aux[1:] == aux[:-1]]\n",
      "/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:384: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
      "  bool_ar = (sar[1:] == sar[:-1])\n"
     ]
    }
   ],
   "source": [
    "from sklearn import ensemble, preprocessing, cross_validation\n",
    "from sklearn.metrics import roc_auc_score as auc\n",
    "from time import time\n",
    "\n",
    "# Label-encode every object-dtype (string) column of `train` and reuse the fitted\n",
    "# encoder on `test`. `encoders` maps column name -> fitted LabelEncoder.\n",
    "is_string = train.dtypes == 'object'\n",
    "strings = is_string[is_string].index.tolist()\n",
    "encoders = {}\n",
    "for col in strings:\n",
    "    encoders[col] = preprocessing.LabelEncoder()\n",
    "    train[col] = encoders[col].fit_transform(train[col])\n",
    "    try:\n",
    "        test[col] = encoders[col].transform(test[col])\n",
    "    except Exception:\n",
    "        # Lazy way to incorporate the feature only if it can be encoded in the test set.\n",
    "        # `Exception` instead of a bare except, so a kernel interrupt is not swallowed.\n",
    "        if col in test:\n",
    "            del test[col]\n",
    "        del train[col]\n",
    "        # Keep `encoders` limited to the columns that are still present.\n",
    "        del encoders[col]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# DATA ready: X is every column except the label, with NaN replaced by 0; y is the label.\n",
    "# The axis is passed by keyword, matching the constants-removal cell.\n",
    "X = train.drop('target', axis=1).fillna(0)\n",
    "y = train['target']\n",
    "\n",
    "# RF FTW :)\n",
    "rf = ensemble.RandomForestClassifier(n_jobs=4, n_estimators=20, random_state=11)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TRAIN 1.0000 | TEST 0.7297 | TIME 7.91m (1-fold)\n"
     ]
    }
   ],
   "source": [
    "# CROSS VALIDATE AND PRINT TRAIN AND TEST SCORE\n",
    "# Each fold refits `rf`, scores AUC on the fitted rows and on the held-out rows, and\n",
    "# records the elapsed time; the printed figures are means over the folds.\n",
    "kf = cross_validation.StratifiedKFold(y, n_folds=3, shuffle=True, random_state=11)\n",
    "trscores, cvscores, times = [], [], []\n",
    "for itr, icv in kf:\n",
    "    t = time()\n",
    "    X_fit, y_fit = X.iloc[itr], y.iloc[itr]\n",
    "    rf.fit(X_fit, y_fit)\n",
    "    trscore = auc(y_fit, rf.predict_proba(X_fit)[:, 1])\n",
    "    cvscore = auc(y.iloc[icv], rf.predict_proba(X.iloc[icv])[:, 1])\n",
    "    trscores.append(trscore)\n",
    "    cvscores.append(cvscore)\n",
    "    times.append(time() - t)\n",
    "summary = (np.mean(trscores), np.mean(cvscores), np.mean(times) / 60)\n",
    "print(\"TRAIN %.4f | TEST %.4f | TIME %.2fm (1-fold)\" % summary)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

## Springleaf with xgb.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf with xgb.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Springleaf with xgb1.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf with xgb1.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/Applications/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1159: DtypeWarning: Columns (8,9,10,11,12,43,157,167,177,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.\n",
	" data = self._reader.read(nrows)\n"
	]
	}
	],
	"source": [
	"# load relevant libraries\n",
	"\n",
	"import pandas as pd\n",
	"import matplotlib.pyplot as plt\n",
	"import numpy as np\n",
	"import seaborn as sns\n",
	"\n",
	"%matplotlib inline\n",
	"\n",
	"# Directory holding the Springleaf CSV files; edit this one constant to relocate the data\n",
	"DATA_DIR = \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/SpringLeaf\"\n",
	"\n",
	"# Read in the Springleaf train.csv file, indexed by ID\n",
	"url = DATA_DIR + \"/train.csv\"\n",
	"train = pd.read_csv(url).set_index(\"ID\")\n",
	"\n",
	"# Read in the Springleaf test.csv file, indexed by ID\n",
	"url = DATA_DIR + \"/test.csv\"\n",
	"test = pd.read_csv(url).set_index(\"ID\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>ID</th>\n",
	" <th>VAR_0002</th>\n",
	" <th>VAR_0003</th>\n",
	" <th>VAR_0004</th>\n",
	" <th>VAR_0006</th>\n",
	" <th>VAR_0007</th>\n",
	" <th>VAR_0013</th>\n",
	" <th>VAR_0014</th>\n",
	" <th>VAR_0015</th>\n",
	" <th>VAR_0016</th>\n",
	" <th>...</th>\n",
	" <th>VAR_1925</th>\n",
	" <th>VAR_1926</th>\n",
	" <th>VAR_1927</th>\n",
	" <th>VAR_1928</th>\n",
	" <th>VAR_1929</th>\n",
	" <th>VAR_1930</th>\n",
	" <th>VAR_1931</th>\n",
	" <th>VAR_1932</th>\n",
	" <th>VAR_1933</th>\n",
	" <th>target</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>count</th>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145175.000000</td>\n",
	" <td> 145175.00000</td>\n",
	" <td> 145175.000000</td>\n",
	" <td> 145175.000000</td>\n",
	" <td> 145175.000000</td>\n",
	" <td> 145175.000000</td>\n",
	" <td>...</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 1.452310e+05</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145231.000000</td>\n",
	" <td> 145231.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>mean</th>\n",
	" <td> 145124.981808</td>\n",
	" <td> 105.278040</td>\n",
	" <td> 88.492285</td>\n",
	" <td> 3721.946210</td>\n",
	" <td> 0.693907</td>\n",
	" <td> 0.51316</td>\n",
	" <td> 0.693907</td>\n",
	" <td> 0.494493</td>\n",
	" <td> 0.476763</td>\n",
	" <td> 1.447598</td>\n",
	" <td>...</td>\n",
	" <td> 0.549965</td>\n",
	" <td> 86.664369</td>\n",
	" <td> 89.828762</td>\n",
	" <td> 914.417259</td>\n",
	" <td> 9.904497e+08</td>\n",
	" <td> 922.938505</td>\n",
	" <td> 936.487905</td>\n",
	" <td> 9942.162279</td>\n",
	" <td> 8849.839111</td>\n",
	" <td> 0.232547</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>std</th>\n",
	" <td> 83890.290627</td>\n",
	" <td> 143.351237</td>\n",
	" <td> 125.903323</td>\n",
	" <td> 12280.012213</td>\n",
	" <td> 1.604578</td>\n",
	" <td> 1.17531</td>\n",
	" <td> 1.604578</td>\n",
	" <td> 1.115085</td>\n",
	" <td> 1.000755</td>\n",
	" <td> 1.132784</td>\n",
	" <td>...</td>\n",
	" <td> 6.361312</td>\n",
	" <td> 31.347700</td>\n",
	" <td> 26.928221</td>\n",
	" <td> 276.361514</td>\n",
	" <td> 9.725818e+07</td>\n",
	" <td> 212.087416</td>\n",
	" <td> 210.370833</td>\n",
	" <td> 743.153332</td>\n",
	" <td> 3175.797333</td>\n",
	" <td> 0.422457</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>min</th>\n",
	" <td> 2.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.00000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td>...</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000e+00</td>\n",
	" <td> 1.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>25%</th>\n",
	" <td> 72342.500000</td>\n",
	" <td> 24.000000</td>\n",
	" <td> 13.000000</td>\n",
	" <td> 1790.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.00000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 1.000000</td>\n",
	" <td>...</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 98.000000</td>\n",
	" <td> 98.000000</td>\n",
	" <td> 998.000000</td>\n",
	" <td> 1.000000e+09</td>\n",
	" <td> 998.000000</td>\n",
	" <td> 998.000000</td>\n",
	" <td> 9998.000000</td>\n",
	" <td> 9998.000000</td>\n",
	" <td> 0.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>50%</th>\n",
	" <td> 145272.000000</td>\n",
	" <td> 60.000000</td>\n",
	" <td> 55.000000</td>\n",
	" <td> 2500.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.00000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 1.000000</td>\n",
	" <td>...</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 98.000000</td>\n",
	" <td> 98.000000</td>\n",
	" <td> 998.000000</td>\n",
	" <td> 1.000000e+09</td>\n",
	" <td> 998.000000</td>\n",
	" <td> 998.000000</td>\n",
	" <td> 9998.000000</td>\n",
	" <td> 9998.000000</td>\n",
	" <td> 0.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>75%</th>\n",
	" <td> 217686.500000</td>\n",
	" <td> 132.000000</td>\n",
	" <td> 120.000000</td>\n",
	" <td> 3600.000000</td>\n",
	" <td> 1.000000</td>\n",
	" <td> 1.00000</td>\n",
	" <td> 1.000000</td>\n",
	" <td> 1.000000</td>\n",
	" <td> 1.000000</td>\n",
	" <td> 1.000000</td>\n",
	" <td>...</td>\n",
	" <td> 0.000000</td>\n",
	" <td> 98.000000</td>\n",
	" <td> 98.000000</td>\n",
	" <td> 998.000000</td>\n",
	" <td> 1.000000e+09</td>\n",
	" <td> 998.000000</td>\n",
	" <td> 998.000000</td>\n",
	" <td> 9998.000000</td>\n",
	" <td> 9998.000000</td>\n",
	" <td> 0.000000</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>max</th>\n",
	" <td> 290463.000000</td>\n",
	" <td> 999.000000</td>\n",
	" <td> 999.000000</td>\n",
	" <td> 2200000.000000</td>\n",
	" <td> 68.000000</td>\n",
	" <td> 66.00000</td>\n",
	" <td> 68.000000</td>\n",
	" <td> 67.000000</td>\n",
	" <td> 35.000000</td>\n",
	" <td> 39.000000</td>\n",
	" <td>...</td>\n",
	" <td> 99.000000</td>\n",
	" <td> 99.000000</td>\n",
	" <td> 99.000000</td>\n",
	" <td> 999.000000</td>\n",
	" <td> 1.000000e+09</td>\n",
	" <td> 999.000000</td>\n",
	" <td> 999.000000</td>\n",
	" <td> 9999.000000</td>\n",
	" <td> 9999.000000</td>\n",
	" <td> 1.000000</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>8 rows × 1883 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" ID VAR_0002 VAR_0003 VAR_0004 \\\n",
	"count 145231.000000 145231.000000 145231.000000 145231.000000 \n",
	"mean 145124.981808 105.278040 88.492285 3721.946210 \n",
	"std 83890.290627 143.351237 125.903323 12280.012213 \n",
	"min 2.000000 0.000000 0.000000 0.000000 \n",
	"25% 72342.500000 24.000000 13.000000 1790.000000 \n",
	"50% 145272.000000 60.000000 55.000000 2500.000000 \n",
	"75% 217686.500000 132.000000 120.000000 3600.000000 \n",
	"max 290463.000000 999.000000 999.000000 2200000.000000 \n",
	"\n",
	" VAR_0006 VAR_0007 VAR_0013 VAR_0014 \\\n",
	"count 145175.000000 145175.00000 145175.000000 145175.000000 \n",
	"mean 0.693907 0.51316 0.693907 0.494493 \n",
	"std 1.604578 1.17531 1.604578 1.115085 \n",
	"min 0.000000 0.00000 0.000000 0.000000 \n",
	"25% 0.000000 0.00000 0.000000 0.000000 \n",
	"50% 0.000000 0.00000 0.000000 0.000000 \n",
	"75% 1.000000 1.00000 1.000000 1.000000 \n",
	"max 68.000000 66.00000 68.000000 67.000000 \n",
	"\n",
	" VAR_0015 VAR_0016 ... VAR_1925 \\\n",
	"count 145175.000000 145175.000000 ... 145231.000000 \n",
	"mean 0.476763 1.447598 ... 0.549965 \n",
	"std 1.000755 1.132784 ... 6.361312 \n",
	"min 0.000000 0.000000 ... 0.000000 \n",
	"25% 0.000000 1.000000 ... 0.000000 \n",
	"50% 0.000000 1.000000 ... 0.000000 \n",
	"75% 1.000000 1.000000 ... 0.000000 \n",
	"max 35.000000 39.000000 ... 99.000000 \n",
	"\n",
	" VAR_1926 VAR_1927 VAR_1928 VAR_1929 \\\n",
	"count 145231.000000 145231.000000 145231.000000 1.452310e+05 \n",
	"mean 86.664369 89.828762 914.417259 9.904497e+08 \n",
	"std 31.347700 26.928221 276.361514 9.725818e+07 \n",
	"min 0.000000 0.000000 0.000000 0.000000e+00 \n",
	"25% 98.000000 98.000000 998.000000 1.000000e+09 \n",
	"50% 98.000000 98.000000 998.000000 1.000000e+09 \n",
	"75% 98.000000 98.000000 998.000000 1.000000e+09 \n",
	"max 99.000000 99.000000 999.000000 1.000000e+09 \n",
	"\n",
	" VAR_1930 VAR_1931 VAR_1932 VAR_1933 \\\n",
	"count 145231.000000 145231.000000 145231.000000 145231.000000 \n",
	"mean 922.938505 936.487905 9942.162279 8849.839111 \n",
	"std 212.087416 210.370833 743.153332 3175.797333 \n",
	"min 1.000000 0.000000 0.000000 0.000000 \n",
	"25% 998.000000 998.000000 9998.000000 9998.000000 \n",
	"50% 998.000000 998.000000 9998.000000 9998.000000 \n",
	"75% 998.000000 998.000000 9998.000000 9998.000000 \n",
	"max 999.000000 999.000000 9999.000000 9999.000000 \n",
	"\n",
	" target \n",
	"count 145231.000000 \n",
	"mean 0.232547 \n",
	"std 0.422457 \n",
	"min 0.000000 \n",
	"25% 0.000000 \n",
	"50% 0.000000 \n",
	"75% 0.000000 \n",
	"max 1.000000 \n",
	"\n",
	"[8 rows x 1883 columns]"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# lcd.describe()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# url1 = \"/Users/Glenn/Documents/GA_Data_Science/Kaggle/GreenLeaf/train_describe.txt\"\n",
	"# f = open(url1, 'w')\n",
	"# f.write(str(lcd.describe()))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# f.close()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Drop constant columns: fewer than two distinct values (as counted by Series.nunique)\n",
	"nunique = pd.Series(\n",
	"    [train[col].nunique() for col in train.columns],\n",
	"    index=train.columns,\n",
	")\n",
	"is_constant = nunique < 2\n",
	"constants = nunique.index[is_constant.values].tolist()\n",
	"train = train.drop(constants, axis=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:198: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
	" flag = np.concatenate(([True], aux[1:] != aux[:-1]))\n",
	"/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:251: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
	" return aux[:-1][aux[1:] == aux[:-1]]\n",
	"/Applications/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:384: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
	" bool_ar = (sar[1:] == sar[:-1])\n"
	]
	}
	],
	"source": [
	"from sklearn import ensemble, preprocessing, cross_validation\n",
	"from sklearn.metrics import roc_auc_score as auc\n",
	"from time import time\n",
	"\n",
	"# Label-encode every object-dtype (string) column of `train` and reuse the fitted\n",
	"# encoder on `test`. `encoders` maps column name -> fitted LabelEncoder.\n",
	"is_string = train.dtypes == 'object'\n",
	"strings = is_string[is_string].index.tolist()\n",
	"encoders = {}\n",
	"for col in strings:\n",
	"    encoders[col] = preprocessing.LabelEncoder()\n",
	"    train[col] = encoders[col].fit_transform(train[col])\n",
	"    try:\n",
	"        test[col] = encoders[col].transform(test[col])\n",
	"    except Exception:\n",
	"        # Lazy way to incorporate the feature only if it can be encoded in the test set.\n",
	"        # `Exception` instead of a bare except, so a kernel interrupt is not swallowed.\n",
	"        if col in test:\n",
	"            del test[col]\n",
	"        del train[col]\n",
	"        # Keep `encoders` limited to the columns that are still present.\n",
	"        del encoders[col]\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# DATA ready: X is every column except the label, with NaN replaced by 0; y is the label.\n",
	"# The axis is passed by keyword, matching the constants-removal cell.\n",
	"X = train.drop('target', axis=1).fillna(0)\n",
	"y = train['target']\n",
	"\n",
	"# RF FTW :)\n",
	"rf = ensemble.RandomForestClassifier(n_jobs=4, n_estimators=20, random_state=11)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"TRAIN 1.0000 | TEST 0.7297 | TIME 7.91m (1-fold)\n"
	]
	}
	],
	"source": [
	"# CROSS VALIDATE AND PRINT TRAIN AND TEST SCORE\n",
	"# Each fold refits `rf`, scores AUC on the fitted rows and on the held-out rows, and\n",
	"# records the elapsed time; the printed figures are means over the folds.\n",
	"kf = cross_validation.StratifiedKFold(y, n_folds=3, shuffle=True, random_state=11)\n",
	"trscores, cvscores, times = [], [], []\n",
	"for itr, icv in kf:\n",
	"    t = time()\n",
	"    X_fit, y_fit = X.iloc[itr], y.iloc[itr]\n",
	"    rf.fit(X_fit, y_fit)\n",
	"    trscore = auc(y_fit, rf.predict_proba(X_fit)[:, 1])\n",
	"    cvscore = auc(y.iloc[icv], rf.predict_proba(X.iloc[icv])[:, 1])\n",
	"    trscores.append(trscore)\n",
	"    cvscores.append(cvscore)\n",
	"    times.append(time() - t)\n",
	"summary = (np.mean(trscores), np.mean(cvscores), np.mean(times) / 60)\n",
	"print(\"TRAIN %.4f | TEST %.4f | TIME %.2fm (1-fold)\" % summary)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}