ocoyawale/Coupon Purchase Prediction - BTB Script.ipynb

## Coupon Purchase Prediction - BTB Script.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Coupon Purchase Prediction - BTB Script.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Coupon Purchase Prediction - First Script.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Coupon Purchase Prediction - First Script.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Flavor of Physics - Classification.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import ensemble, preprocessing, cross_validation\n",
    "from sklearn.metrics import roc_auc_score as auc\n",
    "from time import time\n",
    "from sklearn import svm\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn import neighbors, datasets\n",
    "from sklearn import tree\n",
    "from sklearn.datasets import make_hastie_10_2\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from pandas import Series,DataFrame\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn import metrics\n",
    "import math\n",
    "import statsmodels.api as sm\n",
    "from datetime import datetime\n",
    "import requests\n",
    "from StringIO import StringIO\n",
    "from time import time\n",
    "import sys\n",
    "sys.path.append('C:\\\\Users\\\\Vikrant\\\\xgboost\\\\python-package')\n",
    "import xgboost as xgb\n",
    "from datetime import  datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# PREPARE DATA\n",
    "# Forward slashes are accepted on Windows as well as POSIX, so the notebook\n",
    "# does not depend on the Windows-only backslash separator.\n",
    "data = pd.read_csv('Physics/Input/training.csv')  # training set (contains the `signal` label)\n",
    "test = pd.read_csv('Physics/Input/test.csv')      # rows to score for the submission"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Target vector: the `signal` column of the training frame.\n",
    "Y = data['signal']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>LifeTime</th>\n",
       "      <th>dira</th>\n",
       "      <th>FlightDistance</th>\n",
       "      <th>FlightDistanceError</th>\n",
       "      <th>IP</th>\n",
       "      <th>IPSig</th>\n",
       "      <th>VertexChi2</th>\n",
       "      <th>pt</th>\n",
       "      <th>DOCAone</th>\n",
       "      <th>...</th>\n",
       "      <th>p0_pt</th>\n",
       "      <th>p1_pt</th>\n",
       "      <th>p2_pt</th>\n",
       "      <th>p0_p</th>\n",
       "      <th>p1_p</th>\n",
       "      <th>p2_p</th>\n",
       "      <th>p0_eta</th>\n",
       "      <th>p1_eta</th>\n",
       "      <th>p2_eta</th>\n",
       "      <th>SPDhits</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>14711831</td>\n",
       "      <td>0.001273</td>\n",
       "      <td>0.999816</td>\n",
       "      <td>8.569642</td>\n",
       "      <td>0.310039</td>\n",
       "      <td>0.197893</td>\n",
       "      <td>5.183939</td>\n",
       "      <td>2.286117</td>\n",
       "      <td>687.479248</td>\n",
       "      <td>0.022301</td>\n",
       "      <td>...</td>\n",
       "      <td>508.926514</td>\n",
       "      <td>306.629456</td>\n",
       "      <td>714.222717</td>\n",
       "      <td>30300.734375</td>\n",
       "      <td>5662.158691</td>\n",
       "      <td>7406.368164</td>\n",
       "      <td>4.779700</td>\n",
       "      <td>3.608334</td>\n",
       "      <td>3.029715</td>\n",
       "      <td>705</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16316387</td>\n",
       "      <td>0.000661</td>\n",
       "      <td>0.999954</td>\n",
       "      <td>7.061379</td>\n",
       "      <td>0.348772</td>\n",
       "      <td>0.067430</td>\n",
       "      <td>6.316281</td>\n",
       "      <td>3.865824</td>\n",
       "      <td>5691.845703</td>\n",
       "      <td>0.002455</td>\n",
       "      <td>...</td>\n",
       "      <td>943.130676</td>\n",
       "      <td>3287.291260</td>\n",
       "      <td>1563.593994</td>\n",
       "      <td>6160.323730</td>\n",
       "      <td>43781.566406</td>\n",
       "      <td>19728.500000</td>\n",
       "      <td>2.563915</td>\n",
       "      <td>3.280884</td>\n",
       "      <td>3.226650</td>\n",
       "      <td>907</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6771382</td>\n",
       "      <td>0.000889</td>\n",
       "      <td>0.999999</td>\n",
       "      <td>6.502967</td>\n",
       "      <td>0.273914</td>\n",
       "      <td>0.011020</td>\n",
       "      <td>0.865816</td>\n",
       "      <td>1.004631</td>\n",
       "      <td>5429.225586</td>\n",
       "      <td>0.002753</td>\n",
       "      <td>...</td>\n",
       "      <td>2886.055908</td>\n",
       "      <td>2216.936279</td>\n",
       "      <td>598.068359</td>\n",
       "      <td>26387.134766</td>\n",
       "      <td>15534.978516</td>\n",
       "      <td>6504.145020</td>\n",
       "      <td>2.903129</td>\n",
       "      <td>2.634985</td>\n",
       "      <td>3.077517</td>\n",
       "      <td>213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>686045</td>\n",
       "      <td>0.001295</td>\n",
       "      <td>0.999301</td>\n",
       "      <td>7.838184</td>\n",
       "      <td>0.325697</td>\n",
       "      <td>0.388636</td>\n",
       "      <td>11.391245</td>\n",
       "      <td>10.345356</td>\n",
       "      <td>661.717834</td>\n",
       "      <td>0.008163</td>\n",
       "      <td>...</td>\n",
       "      <td>461.727753</td>\n",
       "      <td>357.967560</td>\n",
       "      <td>1118.084229</td>\n",
       "      <td>5451.857422</td>\n",
       "      <td>10281.403320</td>\n",
       "      <td>23722.742188</td>\n",
       "      <td>3.160085</td>\n",
       "      <td>4.050494</td>\n",
       "      <td>3.747409</td>\n",
       "      <td>275</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>8755882</td>\n",
       "      <td>0.001714</td>\n",
       "      <td>0.999805</td>\n",
       "      <td>7.064559</td>\n",
       "      <td>0.113440</td>\n",
       "      <td>0.154421</td>\n",
       "      <td>13.925871</td>\n",
       "      <td>12.951378</td>\n",
       "      <td>3885.417969</td>\n",
       "      <td>0.066317</td>\n",
       "      <td>...</td>\n",
       "      <td>1773.747925</td>\n",
       "      <td>1643.278198</td>\n",
       "      <td>711.572205</td>\n",
       "      <td>7724.954590</td>\n",
       "      <td>12830.546875</td>\n",
       "      <td>6008.145996</td>\n",
       "      <td>2.151059</td>\n",
       "      <td>2.744157</td>\n",
       "      <td>2.823017</td>\n",
       "      <td>357</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 47 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         id  LifeTime      dira  FlightDistance  FlightDistanceError  \\\n",
       "0  14711831  0.001273  0.999816        8.569642             0.310039   \n",
       "1  16316387  0.000661  0.999954        7.061379             0.348772   \n",
       "2   6771382  0.000889  0.999999        6.502967             0.273914   \n",
       "3    686045  0.001295  0.999301        7.838184             0.325697   \n",
       "4   8755882  0.001714  0.999805        7.064559             0.113440   \n",
       "\n",
       "         IP      IPSig  VertexChi2           pt   DOCAone   ...     \\\n",
       "0  0.197893   5.183939    2.286117   687.479248  0.022301   ...      \n",
       "1  0.067430   6.316281    3.865824  5691.845703  0.002455   ...      \n",
       "2  0.011020   0.865816    1.004631  5429.225586  0.002753   ...      \n",
       "3  0.388636  11.391245   10.345356   661.717834  0.008163   ...      \n",
       "4  0.154421  13.925871   12.951378  3885.417969  0.066317   ...      \n",
       "\n",
       "         p0_pt        p1_pt        p2_pt          p0_p          p1_p  \\\n",
       "0   508.926514   306.629456   714.222717  30300.734375   5662.158691   \n",
       "1   943.130676  3287.291260  1563.593994   6160.323730  43781.566406   \n",
       "2  2886.055908  2216.936279   598.068359  26387.134766  15534.978516   \n",
       "3   461.727753   357.967560  1118.084229   5451.857422  10281.403320   \n",
       "4  1773.747925  1643.278198   711.572205   7724.954590  12830.546875   \n",
       "\n",
       "           p2_p    p0_eta    p1_eta    p2_eta  SPDhits  \n",
       "0   7406.368164  4.779700  3.608334  3.029715      705  \n",
       "1  19728.500000  2.563915  3.280884  3.226650      907  \n",
       "2   6504.145020  2.903129  2.634985  3.077517      213  \n",
       "3  23722.742188  3.160085  4.050494  3.747409      275  \n",
       "4   6008.145996  2.151059  2.744157  2.823017      357  \n",
       "\n",
       "[5 rows x 47 columns]"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# DATA ready\n",
    "# Drop the label, the row id and the columns that are not used as features,\n",
    "# then replace missing values with 0.\n",
    "X = data.drop(['signal', 'mass', 'production', 'min_ANNmuon', 'id'], axis=1).fillna(0)\n",
    "# Apply the same missing-value handling to the test features as to X.\n",
    "test1 = test.drop('id', axis=1).fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Alternatives tried earlier: preprocessing.normalize(...) and preprocessing.scale(...).\n",
    "# Fit the scaler on the training features only and reuse it for the test set,\n",
    "# so both matrices are mapped with the same per-feature min/max. Fitting a\n",
    "# second scaler on the test set would give the same raw value different\n",
    "# scaled values in train and test.\n",
    "scaler = preprocessing.MinMaxScaler().fit(X)\n",
    "X = scaler.transform(X)\n",
    "test1 = scaler.transform(test1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Cast both feature matrices to float before modelling.\n",
    "X, test1 = X.astype(float), test1.astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# RF FTW :)\n",
    "#clf = ensemble.RandomForestClassifier(n_jobs=30, n_estimators = 70, random_state = 25)\n",
    "#clf = ensemble.RandomForestClassifier(n_jobs=10, n_estimators = 50, random_state = 15)\n",
    "#SVC\n",
    "#clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)\n",
    "#clf = svm.SVC(C=2.0, cache_size=200, class_weight=1, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=True, random_state=4, shrinking=True, tol=0.001, verbose=False)\n",
    "#clf = svm.SVC(probability=True)\n",
    "#Stochastic Gradient Descent\n",
    "#clf = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)\n",
    "#clf=SGDClassifier(loss='log',alpha=0.000001,n_iter=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Active model.\n",
    "#ExtraTreesClassifier - 0.9808\n",
    "clf = ExtraTreesClassifier(\n",
    "    n_estimators=150,\n",
    "    max_depth=20,\n",
    "    min_samples_split=2,\n",
    "    random_state=100,\n",
    ")\n",
    "\n",
    "# Other configurations that were tried (kept for reference):\n",
    "#clf = DecisionTreeClassifier(max_depth=5, min_samples_split=1, random_state=20)\n",
    "#RandomForestClassifier - \n",
    "#clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=50)\n",
    "#clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=40)\n",
    "#clf = ExtraTreesClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=200,bootstrap=True)\n",
    "#clf = ExtraTreesClassifier(n_estimators=200, max_depth=30, min_samples_split=4, random_state=200)\n",
    "#Nearest Neighbors Classifier\n",
    "#clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)\n",
    "#clf = neighbors.KNeighborsClassifier()\n",
    "#Decision Tree Classifier\n",
    "#clf = tree.DecisionTreeClassifier()\n",
    "#Adaboost\n",
    "#clf = AdaBoostClassifier(n_estimators=100)\n",
    "#GradientBoostingClassifier# - \n",
    "#clf = GradientBoostingClassifier(n_estimators=400, learning_rate=1.0, max_depth=3, random_state=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# CROSS VALIDATE AND PRINT TRAIN AND TEST SCORE\n",
    "#kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True, random_state=11)\n",
    "#trscores, cvscores, times = [], [], []\n",
    "#for itr, icv in kf:\n",
    "#    t = time()\n",
    "#    trscore = auc(y.iloc[itr], clf.fit(X.iloc[itr], y.iloc[itr]).predict_proba(X.iloc[itr])[:,1])\n",
    "#    cvscore = auc(y.iloc[icv], clf.predict_proba(X.iloc[icv])[:,1])\n",
    "#    trscores.append(trscore); cvscores.append(cvscore); times.append(time()-t)\n",
    "#print \"TRAIN %.4f | TEST %.4f | TIME %.2fm (1-fold)\" % (np.mean(trscores), np.mean(cvscores), np.mean(times)/60)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Hold out part of the training data for a quick accuracy check.\n",
    "# A fixed random_state makes the split, and therefore the hold-out score,\n",
    "# reproducible across kernel restarts.\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',\n",
       "           max_depth=20, max_features='auto', max_leaf_nodes=None,\n",
       "           min_samples_leaf=1, min_samples_split=2,\n",
       "           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,\n",
       "           oob_score=False, random_state=100, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit on the training part of the split; the fitted estimator is the cell output.\n",
    "clf.fit(X=X_train, y=Y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.852270708745\n"
     ]
    }
   ],
   "source": [
    "# Accuracy of the fitted classifier on the held-out part of the split.\n",
    "expected = Y_test\n",
    "predicted = clf.predict(X_test)\n",
    "print(metrics.accuracy_score(expected, predicted))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# MAKING SUBMISSION\n",
    "# Refit on the full training set, then score the test set. Column 1 of\n",
    "# predict_proba is the probability of the second class in clf.classes_.\n",
    "clf.fit(X, Y)\n",
    "submission = pd.DataFrame(clf.predict_proba(test1)[:, 1], index=test.id, columns=['prediction'])\n",
    "submission.index.name = 'id'\n",
    "# Forward slashes work on Windows and POSIX alike.\n",
    "submission.to_csv('Physics/Output/Results.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# The previous cell already refits the model and writes the ExtraTrees results file.\n",
    "# This cell used to repeat it verbatim (a second full refit and an identical\n",
    "# file write); preview the submission frame instead.\n",
    "submission.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Booster settings and number of boosting rounds for the xgboost model.\n",
    "num_rounds = 400\n",
    "xgb_params = dict(objective='binary:logistic', max_depth=15, silent=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Train the booster on the full training set and score the test features.\n",
    "dtest = xgb.DMatrix(test1)\n",
    "dtrain = xgb.DMatrix(X, label=Y)\n",
    "gbdt = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=num_rounds)\n",
    "preds = gbdt.predict(dtest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Name the index 'id' so the header matches the test file's `id` column and\n",
    "# the ExtraTrees submission written earlier (this cell previously used 'ID').\n",
    "submission = pd.DataFrame({'id': test.id, 'prediction': preds}).set_index('id')\n",
    "# Forward slashes work on Windows and POSIX alike.\n",
    "submission.to_csv('Physics/Output/Results1.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

## Flavor of Physics - Mix of models.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Flavor of Physics - Mix of models.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Springleaf - Experiments with Classification.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf - Experiments with Classification.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Springleaf - Experiments with Random Forest.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf - Experiments with Random Forest.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## SpringLeaf -Kaggle 18AUG15 (1).ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              SpringLeaf -Kaggle 18AUG15 (1).ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## SpringLeaf -Kaggle 18AUG15.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              SpringLeaf -Kaggle 18AUG15.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Springleaf with xgb.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf with xgb.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Springleaf with xgb1.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf with xgb1.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 47,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"from sklearn import ensemble, preprocessing, cross_validation\n",
	"from sklearn.metrics import roc_auc_score as auc\n",
	"from time import time\n",
	"from sklearn import svm\n",
	"from sklearn.svm import SVC\n",
	"from sklearn.linear_model import SGDClassifier\n",
	"from sklearn.ensemble import RandomForestClassifier\n",
	"from sklearn.ensemble import ExtraTreesClassifier\n",
	"from sklearn.tree import DecisionTreeClassifier\n",
	"from sklearn.ensemble import AdaBoostClassifier\n",
	"from sklearn import neighbors, datasets\n",
	"from sklearn import tree\n",
	"from sklearn.datasets import make_hastie_10_2\n",
	"from sklearn.ensemble import GradientBoostingClassifier\n",
	"from pandas import Series,DataFrame\n",
	"import matplotlib.pyplot as plt\n",
	"%matplotlib inline\n",
	"import seaborn as sns\n",
	"from sklearn.linear_model import LogisticRegression\n",
	"from sklearn.cross_validation import train_test_split\n",
	"from sklearn import metrics\n",
	"import math\n",
	"import statsmodels.api as sm\n",
	"from datetime import datetime\n",
	"import requests\n",
	"from StringIO import StringIO\n",
	"from time import time\n",
	"import sys\n",
	"sys.path.append('C:\\\\Users\\\\Vikrant\\\\xgboost\\\\python-package')\n",
	"import xgboost as xgb\n",
	"from datetime import datetime"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 48,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# PREPARE DATA\n",
	"# Forward slashes are accepted on Windows as well as POSIX, so the notebook\n",
	"# does not depend on the Windows-only backslash separator.\n",
	"data = pd.read_csv('Physics/Input/training.csv')  # training set (contains the `signal` label)\n",
	"test = pd.read_csv('Physics/Input/test.csv')      # rows to score for the submission"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 49,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Target vector: the `signal` column of the training frame.\n",
	"Y = data['signal']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 50,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>id</th>\n",
	" <th>LifeTime</th>\n",
	" <th>dira</th>\n",
	" <th>FlightDistance</th>\n",
	" <th>FlightDistanceError</th>\n",
	" <th>IP</th>\n",
	" <th>IPSig</th>\n",
	" <th>VertexChi2</th>\n",
	" <th>pt</th>\n",
	" <th>DOCAone</th>\n",
	" <th>...</th>\n",
	" <th>p0_pt</th>\n",
	" <th>p1_pt</th>\n",
	" <th>p2_pt</th>\n",
	" <th>p0_p</th>\n",
	" <th>p1_p</th>\n",
	" <th>p2_p</th>\n",
	" <th>p0_eta</th>\n",
	" <th>p1_eta</th>\n",
	" <th>p2_eta</th>\n",
	" <th>SPDhits</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>14711831</td>\n",
	" <td>0.001273</td>\n",
	" <td>0.999816</td>\n",
	" <td>8.569642</td>\n",
	" <td>0.310039</td>\n",
	" <td>0.197893</td>\n",
	" <td>5.183939</td>\n",
	" <td>2.286117</td>\n",
	" <td>687.479248</td>\n",
	" <td>0.022301</td>\n",
	" <td>...</td>\n",
	" <td>508.926514</td>\n",
	" <td>306.629456</td>\n",
	" <td>714.222717</td>\n",
	" <td>30300.734375</td>\n",
	" <td>5662.158691</td>\n",
	" <td>7406.368164</td>\n",
	" <td>4.779700</td>\n",
	" <td>3.608334</td>\n",
	" <td>3.029715</td>\n",
	" <td>705</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>16316387</td>\n",
	" <td>0.000661</td>\n",
	" <td>0.999954</td>\n",
	" <td>7.061379</td>\n",
	" <td>0.348772</td>\n",
	" <td>0.067430</td>\n",
	" <td>6.316281</td>\n",
	" <td>3.865824</td>\n",
	" <td>5691.845703</td>\n",
	" <td>0.002455</td>\n",
	" <td>...</td>\n",
	" <td>943.130676</td>\n",
	" <td>3287.291260</td>\n",
	" <td>1563.593994</td>\n",
	" <td>6160.323730</td>\n",
	" <td>43781.566406</td>\n",
	" <td>19728.500000</td>\n",
	" <td>2.563915</td>\n",
	" <td>3.280884</td>\n",
	" <td>3.226650</td>\n",
	" <td>907</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>6771382</td>\n",
	" <td>0.000889</td>\n",
	" <td>0.999999</td>\n",
	" <td>6.502967</td>\n",
	" <td>0.273914</td>\n",
	" <td>0.011020</td>\n",
	" <td>0.865816</td>\n",
	" <td>1.004631</td>\n",
	" <td>5429.225586</td>\n",
	" <td>0.002753</td>\n",
	" <td>...</td>\n",
	" <td>2886.055908</td>\n",
	" <td>2216.936279</td>\n",
	" <td>598.068359</td>\n",
	" <td>26387.134766</td>\n",
	" <td>15534.978516</td>\n",
	" <td>6504.145020</td>\n",
	" <td>2.903129</td>\n",
	" <td>2.634985</td>\n",
	" <td>3.077517</td>\n",
	" <td>213</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>686045</td>\n",
	" <td>0.001295</td>\n",
	" <td>0.999301</td>\n",
	" <td>7.838184</td>\n",
	" <td>0.325697</td>\n",
	" <td>0.388636</td>\n",
	" <td>11.391245</td>\n",
	" <td>10.345356</td>\n",
	" <td>661.717834</td>\n",
	" <td>0.008163</td>\n",
	" <td>...</td>\n",
	" <td>461.727753</td>\n",
	" <td>357.967560</td>\n",
	" <td>1118.084229</td>\n",
	" <td>5451.857422</td>\n",
	" <td>10281.403320</td>\n",
	" <td>23722.742188</td>\n",
	" <td>3.160085</td>\n",
	" <td>4.050494</td>\n",
	" <td>3.747409</td>\n",
	" <td>275</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>8755882</td>\n",
	" <td>0.001714</td>\n",
	" <td>0.999805</td>\n",
	" <td>7.064559</td>\n",
	" <td>0.113440</td>\n",
	" <td>0.154421</td>\n",
	" <td>13.925871</td>\n",
	" <td>12.951378</td>\n",
	" <td>3885.417969</td>\n",
	" <td>0.066317</td>\n",
	" <td>...</td>\n",
	" <td>1773.747925</td>\n",
	" <td>1643.278198</td>\n",
	" <td>711.572205</td>\n",
	" <td>7724.954590</td>\n",
	" <td>12830.546875</td>\n",
	" <td>6008.145996</td>\n",
	" <td>2.151059</td>\n",
	" <td>2.744157</td>\n",
	" <td>2.823017</td>\n",
	" <td>357</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>5 rows × 47 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" id LifeTime dira FlightDistance FlightDistanceError \\\n",
	"0 14711831 0.001273 0.999816 8.569642 0.310039 \n",
	"1 16316387 0.000661 0.999954 7.061379 0.348772 \n",
	"2 6771382 0.000889 0.999999 6.502967 0.273914 \n",
	"3 686045 0.001295 0.999301 7.838184 0.325697 \n",
	"4 8755882 0.001714 0.999805 7.064559 0.113440 \n",
	"\n",
	" IP IPSig VertexChi2 pt DOCAone ... \\\n",
	"0 0.197893 5.183939 2.286117 687.479248 0.022301 ... \n",
	"1 0.067430 6.316281 3.865824 5691.845703 0.002455 ... \n",
	"2 0.011020 0.865816 1.004631 5429.225586 0.002753 ... \n",
	"3 0.388636 11.391245 10.345356 661.717834 0.008163 ... \n",
	"4 0.154421 13.925871 12.951378 3885.417969 0.066317 ... \n",
	"\n",
	" p0_pt p1_pt p2_pt p0_p p1_p \\\n",
	"0 508.926514 306.629456 714.222717 30300.734375 5662.158691 \n",
	"1 943.130676 3287.291260 1563.593994 6160.323730 43781.566406 \n",
	"2 2886.055908 2216.936279 598.068359 26387.134766 15534.978516 \n",
	"3 461.727753 357.967560 1118.084229 5451.857422 10281.403320 \n",
	"4 1773.747925 1643.278198 711.572205 7724.954590 12830.546875 \n",
	"\n",
	" p2_p p0_eta p1_eta p2_eta SPDhits \n",
	"0 7406.368164 4.779700 3.608334 3.029715 705 \n",
	"1 19728.500000 2.563915 3.280884 3.226650 907 \n",
	"2 6504.145020 2.903129 2.634985 3.077517 213 \n",
	"3 23722.742188 3.160085 4.050494 3.747409 275 \n",
	"4 6008.145996 2.151059 2.744157 2.823017 357 \n",
	"\n",
	"[5 rows x 47 columns]"
	]
	},
	"execution_count": 50,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"test.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 51,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# DATA ready\n",
	"# Drop the label, the row id and the columns that are not used as features,\n",
	"# then replace missing values with 0.\n",
	"X = data.drop(['signal', 'mass', 'production', 'min_ANNmuon', 'id'], axis=1).fillna(0)\n",
	"# Apply the same missing-value handling to the test features as to X.\n",
	"test1 = test.drop('id', axis=1).fillna(0)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 73,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Alternatives tried earlier: preprocessing.normalize(...) and preprocessing.scale(...).\n",
	"# Fit the scaler on the training features only and reuse it for the test set,\n",
	"# so both matrices are mapped with the same per-feature min/max. Fitting a\n",
	"# second scaler on the test set would give the same raw value different\n",
	"# scaled values in train and test.\n",
	"scaler = preprocessing.MinMaxScaler().fit(X)\n",
	"X = scaler.transform(X)\n",
	"test1 = scaler.transform(test1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 74,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Cast both feature matrices to float before modelling.\n",
	"X, test1 = X.astype(float), test1.astype(float)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 75,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# RF FTW :)\n",
	"#clf = ensemble.RandomForestClassifier(n_jobs=30, n_estimators = 70, random_state = 25)\n",
	"#clf = ensemble.RandomForestClassifier(n_jobs=10, n_estimators = 50, random_state = 15)\n",
	"#SVC\n",
	"#clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)\n",
	"#clf = svm.SVC(C=2.0, cache_size=200, class_weight=1, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=True, random_state=4, shrinking=True, tol=0.001, verbose=False)\n",
	"#clf = svm.SVC(probability=True)\n",
	"#Stochastic Gradient Descent\n",
	"#clf = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)\n",
	"#clf=SGDClassifier(loss='log',alpha=0.000001,n_iter=100)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 76,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Active model.\n",
	"#ExtraTreesClassifier - 0.9808\n",
	"clf = ExtraTreesClassifier(\n",
	"    n_estimators=150,\n",
	"    max_depth=20,\n",
	"    min_samples_split=2,\n",
	"    random_state=100,\n",
	")\n",
	"\n",
	"# Other configurations that were tried (kept for reference):\n",
	"#clf = DecisionTreeClassifier(max_depth=5, min_samples_split=1, random_state=20)\n",
	"#RandomForestClassifier - \n",
	"#clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=50)\n",
	"#clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=40)\n",
	"#clf = ExtraTreesClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=200,bootstrap=True)\n",
	"#clf = ExtraTreesClassifier(n_estimators=200, max_depth=30, min_samples_split=4, random_state=200)\n",
	"#Nearest Neighbors Classifier\n",
	"#clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)\n",
	"#clf = neighbors.KNeighborsClassifier()\n",
	"#Decision Tree Classifier\n",
	"#clf = tree.DecisionTreeClassifier()\n",
	"#Adaboost\n",
	"#clf = AdaBoostClassifier(n_estimators=100)\n",
	"#GradientBoostingClassifier# - \n",
	"#clf = GradientBoostingClassifier(n_estimators=400, learning_rate=1.0, max_depth=3, random_state=200)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 77,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# CROSS VALIDATE AND PRINT TRAIN AND TEST SCORE\n",
	"#kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True, random_state=11)\n",
	"#trscores, cvscores, times = [], [], []\n",
	"#for itr, icv in kf:\n",
	"#    t = time()\n",
	"#    trscore = auc(y.iloc[itr], clf.fit(X.iloc[itr], y.iloc[itr]).predict_proba(X.iloc[itr])[:,1])\n",
	"#    cvscore = auc(y.iloc[icv], clf.predict_proba(X.iloc[icv])[:,1])\n",
	"#    trscores.append(trscore); cvscores.append(cvscore); times.append(time()-t)\n",
	"#print \"TRAIN %.4f | TEST %.4f | TIME %.2fm (1-fold)\" % (np.mean(trscores), np.mean(cvscores), np.mean(times)/60)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 78,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Hold out part of the training data for a quick accuracy check.\n",
	"# A fixed random_state makes the split, and therefore the hold-out score,\n",
	"# reproducible across kernel restarts.\n",
	"X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=100)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 79,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',\n",
	" max_depth=20, max_features='auto', max_leaf_nodes=None,\n",
	" min_samples_leaf=1, min_samples_split=2,\n",
	" min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,\n",
	" oob_score=False, random_state=100, verbose=0, warm_start=False)"
	]
	},
	"execution_count": 79,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Fit on the training part of the split; the fitted estimator is the cell output.\n",
	"clf.fit(X=X_train, y=Y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 80,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.852270708745\n"
	]
	}
	],
	"source": [
	"# Accuracy of the fitted classifier on the held-out part of the split.\n",
	"expected = Y_test\n",
	"predicted = clf.predict(X_test)\n",
	"print(metrics.accuracy_score(expected, predicted))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 81,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# MAKING SUBMISSION\n",
	"# Refit on the full training set, then score the test set. Column 1 of\n",
	"# predict_proba is the probability of the second class in clf.classes_.\n",
	"clf.fit(X, Y)\n",
	"submission = pd.DataFrame(clf.predict_proba(test1)[:, 1], index=test.id, columns=['prediction'])\n",
	"submission.index.name = 'id'\n",
	"# Forward slashes work on Windows and POSIX alike.\n",
	"submission.to_csv('Physics/Output/Results.csv')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# The previous cell already refits the model and writes the ExtraTrees results file.\n",
	"# This cell used to repeat it verbatim (a second full refit and an identical\n",
	"# file write); preview the submission frame instead.\n",
	"submission.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 96,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Booster settings and number of boosting rounds for the xgboost model.\n",
	"num_rounds = 400\n",
	"xgb_params = dict(objective='binary:logistic', max_depth=15, silent=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 97,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Train the booster on the full training set and score the test features.\n",
	"dtest = xgb.DMatrix(test1)\n",
	"dtrain = xgb.DMatrix(X, label=Y)\n",
	"gbdt = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=num_rounds)\n",
	"preds = gbdt.predict(dtest)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 98,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Name the index 'id' so the header matches the test file's `id` column and\n",
	"# the ExtraTrees submission written earlier (this cell previously used 'ID').\n",
	"submission = pd.DataFrame({'id': test.id, 'prediction': preds}).set_index('id')\n",
	"# Forward slashes work on Windows and POSIX alike.\n",
	"submission.to_csv('Physics/Output/Results1.csv')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}