ocoyawale/Coupon Purchase Prediction - BTB Script.ipynb

## Coupon Purchase Prediction - BTB Script.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Coupon Purchase Prediction - BTB Script.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Coupon Purchase Prediction - First Script.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Coupon Purchase Prediction - First Script.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Flavor of Physics - Classification.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Flavor of Physics - Classification.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Flavor of Physics - Mix of models.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import ensemble, preprocessing, cross_validation\n",
    "from sklearn.metrics import roc_auc_score as auc\n",
    "from time import time\n",
    "from sklearn import svm\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn import neighbors, datasets\n",
    "from sklearn import tree\n",
    "from sklearn.datasets import make_hastie_10_2\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from pandas import Series,DataFrame\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn import metrics\n",
    "import math\n",
    "import statsmodels.api as sm\n",
    "from datetime import datetime\n",
    "import requests\n",
    "from StringIO import StringIO\n",
    "from time import time\n",
    "import sys\n",
    "sys.path.append('C:\\\\Users\\\\Vikrant\\\\xgboost\\\\python-package')\n",
    "import xgboost as xgb\n",
    "from datetime import  datetime\n",
    "from sklearn.preprocessing import StandardScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_training_data():\n",
    "    filter_out = ['id', 'min_ANNmuon', 'production', 'mass', 'signal', 'SPDhits', 'IP', 'IPSig', 'isolationc']\n",
    "    f = open('Physics\\\\Input\\\\training.csv')\n",
    "    data = []\n",
    "    y = []\n",
    "    ids = []\n",
    "    for i, l in enumerate(f):\n",
    "        if i == 0:\n",
    "            labels = l.rstrip().split(',')\n",
    "            label_indices = dict((l, i) for i, l in enumerate(labels))\n",
    "            continue\n",
    "\n",
    "        values = l.rstrip().split(',')\n",
    "        filtered = []\n",
    "        for v, l in zip(values, labels):\n",
    "            if l not in filter_out:\n",
    "                filtered.append(float(v))\n",
    "\n",
    "        label = values[label_indices['signal']]\n",
    "        ID = values[0]\n",
    "\n",
    "        data.append(filtered)\n",
    "        y.append(float(label))\n",
    "        ids.append(ID)\n",
    "    return ids, np.array(data), np.array(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_test_data():\n",
    "    filter_out = ['id', 'min_ANNmuon', 'production', 'mass', 'signal', 'SPDhits', 'IP', 'IPSig', 'isolationc']\n",
    "    f = open('Physics\\\\Input\\\\test.csv')\n",
    "    data = []\n",
    "    ids = []\n",
    "    for i, l in enumerate(f):\n",
    "        if i == 0:\n",
    "            labels = l.rstrip().split(',')\n",
    "            continue\n",
    "\n",
    "        values = l.rstrip().split(',')\n",
    "        filtered = []\n",
    "        for v, l in zip(values, labels):\n",
    "            if l not in filter_out:\n",
    "                filtered.append(float(v))\n",
    "\n",
    "        ID = values[0]\n",
    "        data.append(filtered)\n",
    "        ids.append(ID)\n",
    "    return ids, np.array(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def preprocess_data(X, scaler=None):\n",
    "    if not scaler:\n",
    "        scaler = StandardScaler()\n",
    "        scaler.fit(X)\n",
    "    X = scaler.transform(X)\n",
    "    return X, scaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('Data shape:', (67553L, 42L))\n"
     ]
    }
   ],
   "source": [
    "# get training data\n",
    "ids, X, y = get_training_data()\n",
    "print('Data shape:', X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('Signal ratio:', 0.6169082054090862)\n"
     ]
    }
   ],
   "source": [
    "# shuffle the data\n",
    "np.random.seed(369)\n",
    "np.random.shuffle(X)\n",
    "np.random.seed(369)\n",
    "np.random.shuffle(y)\n",
    "\n",
    "print('Signal ratio:', np.sum(y) / y.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# preprocess the data\n",
    "X, scaler = preprocess_data(X)\n",
    "y = preprocess_data(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('Train on:', 1L)\n",
      "('Eval on:', 67552L)\n"
     ]
    }
   ],
   "source": [
    "# split into training / evaluation data\n",
    "nb_train_sample = int(len(y) * 0.97)\n",
    "X_train = X[:nb_train_sample]\n",
    "X_eval = X[nb_train_sample:]\n",
    "y_train = y[:nb_train_sample]\n",
    "y_eval = y[nb_train_sample:]\n",
    "\n",
    "print('Train on:', X_train.shape[0])\n",
    "print('Eval on:', X_eval.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Load the training/test data using pandas\n"
     ]
    }
   ],
   "source": [
    "print(\"Load the training/test data using pandas\")\n",
    "train = pd.read_csv(\"Physics\\\\Input\\\\training.csv\")\n",
    "test  = pd.read_csv(\"Physics\\\\Input\\\\test.csv\")\n",
    "train = train.drop(['IPSig'],axis=1)\n",
    "test = test.drop(['IPSig'],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Eliminate SPDhits, which makes the agreement check fail\n",
      "Train a UGradientBoostingClassifier\n"
     ]
    }
   ],
   "source": [
    "print(\"Eliminate SPDhits, which makes the agreement check fail\")\n",
    "features = list(train.columns[1:-5])\n",
    "print(\"Train a UGradientBoostingClassifier\")\n",
    "#loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)\n",
    "clf = GradientBoostingClassifier(n_estimators=300, subsample=0.1, \n",
    "                                  max_depth=6, min_samples_leaf=10, random_state=50)\n",
    "clf.fit(train[features], train['signal'])\n",
    "fb_preds = clf.predict_proba(test[features])[:,1]\n",
    "\n",
    "#loss = KnnFlatnessLossFunction(['mass'], uniform_label=0)\n",
    "clf = ExtraTreesClassifier(n_estimators=200,  \n",
    "                                  max_depth=5, random_state=369)\n",
    "clf.fit(train[features], train['signal'])\n",
    "fb2_preds = clf.predict_proba(test[features])[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train a Random Forest model\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_samples_leaf=1, min_samples_split=2,\n",
       "            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,\n",
       "            oob_score=False, random_state=1, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Train a Random Forest model\")\n",
    "rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion=\"entropy\", random_state=1)\n",
    "rf.fit(train[features], train[\"signal\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train a XGBoost model\n"
     ]
    }
   ],
   "source": [
    "print(\"Train a XGBoost model\")\n",
    "params = {\"objective\": \"binary:logistic\",\n",
    "          \"eta\": 0.2,\n",
    "          \"max_depth\": 4,\n",
    "          \"min_child_weight\": 1,\n",
    "          \"silent\": 1,\n",
    "          \"colsample_bytree\": 0.7,\n",
    "          \"seed\": 1}\n",
    "num_trees=240\n",
    "gbm = xgb.train(params, xgb.DMatrix(train[features], train[\"signal\"]), num_trees)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Make predictions on the test set\n"
     ]
    }
   ],
   "source": [
    "print(\"Make predictions on the test set\")\n",
    "test_probs = (0.25*rf.predict_proba(test[features])[:,1]) + (0.25*gbm.predict(xgb.DMatrix(test[features])))+ + (0.25*fb_preds)+ (0.25*fb2_preds)\n",
    "submission = pd.DataFrame({\"id\": test[\"id\"], \"prediction\": test_probs})\n",
    "submission.to_csv(\"Physics\\\\Output\\\\Mixedmodel.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# RF FTW :)\n",
    "clf1 = ensemble.RandomForestClassifier(n_jobs=400, n_estimators = 100, random_state = 45)\n",
    "#clf = ensemble.RandomForestClassifier(n_jobs=10, n_estimators = 50, random_state = 15)\n",
    "#SVC\n",
    "#clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)\n",
    "#clf = svm.SVC(C=2.0, cache_size=200, class_weight=1, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=True, random_state=4, shrinking=True, tol=0.001, verbose=False)\n",
    "#clf = svm.SVC(probability=True)\n",
    "#Stochastic Gradient Descent\n",
    "#clf = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)\n",
    "#clf=SGDClassifier(loss='log',alpha=0.000001,n_iter=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#clf = DecisionTreeClassifier(max_depth=5, min_samples_split=1, random_state=20)\n",
    "#RandomForestClassifier - \n",
    "clf2 = RandomForestClassifier(n_estimators=450, max_depth=10, min_samples_split=1, random_state=60)\n",
    "#clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=40)\n",
    "#ExtraTreesClassifier - 0.9808\n",
    "clf3 = ExtraTreesClassifier(n_estimators=350, max_depth=20, min_samples_split=2, random_state=160)\n",
    "#clf = ExtraTreesClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=200,bootstrap=True)\n",
    "#clf = ExtraTreesClassifier(n_estimators=200, max_depth=30, min_samples_split=4, random_state=200)\n",
    "#Nearest Neighbors Classifier\n",
    "#clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)\n",
    "#clf = neighbors.KNeighborsClassifier()\n",
    "#Decision Tree Classifier\n",
    "#clf = tree.DecisionTreeClassifier()\n",
    "#Adaboost\n",
    "#clf3 = AdaBoostClassifier(n_estimators=100)\n",
    "#GradientBoostingClassifier# - \n",
    "clf4 = GradientBoostingClassifier(n_estimators=600, learning_rate=1.0, max_depth=3, random_state=650)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = train[features]\n",
    "Y = train['signal']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, Y_train, Y_test = train_test_split(X,Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',\n",
       "              max_depth=3, max_features=None, max_leaf_nodes=None,\n",
       "              min_samples_leaf=1, min_samples_split=2,\n",
       "              min_weight_fraction_leaf=0.0, n_estimators=600,\n",
       "              random_state=650, subsample=1.0, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf1.fit(X_train,Y_train)\n",
    "clf2.fit(X_train,Y_train)\n",
    "clf3.fit(X_train,Y_train)\n",
    "clf4.fit(X_train,Y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.858724613654\n",
      "0.849783883001\n",
      "0.841790514536\n",
      "0.848185209308\n"
     ]
    }
   ],
   "source": [
    "predicted = clf1.predict(X_test)\n",
    "expected = Y_test\n",
    "print metrics.accuracy_score(expected,predicted)\n",
    "predicted = clf2.predict(X_test)\n",
    "expected = Y_test\n",
    "print metrics.accuracy_score(expected,predicted)\n",
    "predicted = clf3.predict(X_test)\n",
    "expected = Y_test\n",
    "print metrics.accuracy_score(expected,predicted)\n",
    "predicted = clf4.predict(X_test)\n",
    "expected = Y_test\n",
    "print metrics.accuracy_score(expected,predicted)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Make predictions on the test set\n"
     ]
    }
   ],
   "source": [
    "print(\"Make predictions on the test set\")\n",
    "test_probs = (0.15*clf1.predict_proba(test[features])[:,1]) + (0.15*clf2.predict_proba(test[features])[:,1]) + (0.50*clf3.predict_proba(test[features])[:,1]) + (0.20*clf4.predict_proba(test[features])[:,1])\n",
    "submission = pd.DataFrame({\"id\": test[\"id\"], \"prediction\": test_probs})\n",
    "submission.to_csv(\"Physics\\\\Output\\\\Mixedmodel.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}

## Springleaf - Experiments with Classification.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf - Experiments with Classification.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Springleaf - Experiments with Random Forest.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf - Experiments with Random Forest.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## SpringLeaf -Kaggle 18AUG15 (1).ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              SpringLeaf -Kaggle 18AUG15 (1).ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## SpringLeaf -Kaggle 18AUG15.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              SpringLeaf -Kaggle 18AUG15.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Springleaf with xgb.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf with xgb.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Springleaf with xgb1.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Springleaf with xgb1.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 81,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"from sklearn import ensemble, preprocessing, cross_validation\n",
	"from sklearn.metrics import roc_auc_score as auc\n",
	"from time import time\n",
	"from sklearn import svm\n",
	"from sklearn.svm import SVC\n",
	"from sklearn.linear_model import SGDClassifier\n",
	"from sklearn.ensemble import RandomForestClassifier\n",
	"from sklearn.ensemble import ExtraTreesClassifier\n",
	"from sklearn.tree import DecisionTreeClassifier\n",
	"from sklearn.ensemble import AdaBoostClassifier\n",
	"from sklearn import neighbors, datasets\n",
	"from sklearn import tree\n",
	"from sklearn.datasets import make_hastie_10_2\n",
	"from sklearn.ensemble import GradientBoostingClassifier\n",
	"from pandas import Series,DataFrame\n",
	"import matplotlib.pyplot as plt\n",
	"%matplotlib inline\n",
	"import seaborn as sns\n",
	"from sklearn.linear_model import LogisticRegression\n",
	"from sklearn.cross_validation import train_test_split\n",
	"from sklearn import metrics\n",
	"import math\n",
	"import statsmodels.api as sm\n",
	"from datetime import datetime\n",
	"import requests\n",
	"from StringIO import StringIO\n",
	"from time import time\n",
	"import sys\n",
	"sys.path.append('C:\\\\Users\\\\Vikrant\\\\xgboost\\\\python-package')\n",
	"import xgboost as xgb\n",
	"from datetime import datetime\n",
	"from sklearn.preprocessing import StandardScaler"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 82,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def get_training_data():\n",
	"    filter_out = ['id', 'min_ANNmuon', 'production', 'mass', 'signal', 'SPDhits', 'IP', 'IPSig', 'isolationc']\n",
	"    f = open('Physics\\Input\\training.csv')\n",
	"    data = []\n",
	"    y = []\n",
	"    ids = []\n",
	"    for i, l in enumerate(f):\n",
	"        if i == 0:\n",
	"            labels = l.rstrip().split(',')\n",
	"            label_indices = dict((l, i) for i, l in enumerate(labels))\n",
	"            continue\n",
	"\n",
	"        values = l.rstrip().split(',')\n",
	"        filtered = []\n",
	"        for v, l in zip(values, labels):\n",
	"            if l not in filter_out:\n",
	"                filtered.append(float(v))\n",
	"\n",
	"        label = values[label_indices['signal']]\n",
	"        ID = values[0]\n",
	"\n",
	"        data.append(filtered)\n",
	"        y.append(float(label))\n",
	"        ids.append(ID)\n",
	"    return ids, np.array(data), np.array(y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 83,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def get_test_data():\n",
	"    filter_out = ['id', 'min_ANNmuon', 'production', 'mass', 'signal', 'SPDhits', 'IP', 'IPSig', 'isolationc']\n",
	"    f = open('Physics\\Input\\test.csv')\n",
	"    data = []\n",
	"    ids = []\n",
	"    for i, l in enumerate(f):\n",
	"        if i == 0:\n",
	"            labels = l.rstrip().split(',')\n",
	"            continue\n",
	"\n",
	"        values = l.rstrip().split(',')\n",
	"        filtered = []\n",
	"        for v, l in zip(values, labels):\n",
	"            if l not in filter_out:\n",
	"                filtered.append(float(v))\n",
	"\n",
	"        ID = values[0]\n",
	"        data.append(filtered)\n",
	"        ids.append(ID)\n",
	"    return ids, np.array(data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 84,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def preprocess_data(X, scaler=None):\n",
	"    if not scaler:\n",
	"        scaler = StandardScaler()\n",
	"        scaler.fit(X)\n",
	"    X = scaler.transform(X)\n",
	"    return X, scaler"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 85,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"('Data shape:', (67553L, 42L))\n"
	]
	}
	],
	"source": [
	"# get training data\n",
	"ids, X, y = get_training_data()\n",
	"print('Data shape:', X.shape)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 86,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"('Signal ratio:', 0.6169082054090862)\n"
	]
	}
	],
	"source": [
	"# shuffle the data\n",
	"np.random.seed(369)\n",
	"np.random.shuffle(X)\n",
	"np.random.seed(369)\n",
	"np.random.shuffle(y)\n",
	"\n",
	"print('Signal ratio:', np.sum(y) / y.shape[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 87,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# preprocess the data\n",
	"X, scaler = preprocess_data(X)\n",
	"y = preprocess_data(y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 88,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"('Train on:', 1L)\n",
	"('Eval on:', 67552L)\n"
	]
	}
	],
	"source": [
	"# split into training / evaluation data\n",
	"nb_train_sample = int(len(y) * 0.97)\n",
	"X_train = X[:nb_train_sample]\n",
	"X_eval = X[nb_train_sample:]\n",
	"y_train = y[:nb_train_sample]\n",
	"y_eval = y[nb_train_sample:]\n",
	"\n",
	"print('Train on:', X_train.shape[0])\n",
	"print('Eval on:', X_eval.shape[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 89,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Load the training/test data using pandas\n"
	]
	}
	],
	"source": [
	"print(\"Load the training/test data using pandas\")\n",
	"train = pd.read_csv(\"Physics\\\\Input\\\\training.csv\")\n",
	"test = pd.read_csv(\"Physics\\\\Input\\\\test.csv\")\n",
	"train = train.drop(['IPSig'],axis=1)\n",
	"test = test.drop(['IPSig'],axis=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 90,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Eliminate SPDhits, which makes the agreement check fail\n",
	"Train a UGradientBoostingClassifier\n"
	]
	}
	],
	"source": [
	"print(\"Eliminate SPDhits, which makes the agreement check fail\")\n",
	"features = list(train.columns[1:-5])\n",
	"print(\"Train a UGradientBoostingClassifier\")\n",
	"#loss = BinFlatnessLossFunction(['mass'], n_bins=15, uniform_label=0)\n",
	"clf = GradientBoostingClassifier(n_estimators=300, subsample=0.1, \n",
	" max_depth=6, min_samples_leaf=10, random_state=50)\n",
	"clf.fit(train[features], train['signal'])\n",
	"fb_preds = clf.predict_proba(test[features])[:,1]\n",
	"\n",
	"#loss = KnnFlatnessLossFunction(['mass'], uniform_label=0)\n",
	"clf = ExtraTreesClassifier(n_estimators=200, \n",
	" max_depth=5, random_state=369)\n",
	"clf.fit(train[features], train['signal'])\n",
	"fb2_preds = clf.predict_proba(test[features])[:,1]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 91,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Train a Random Forest model\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',\n",
	" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
	" min_samples_leaf=1, min_samples_split=2,\n",
	" min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,\n",
	" oob_score=False, random_state=1, verbose=0, warm_start=False)"
	]
	},
	"execution_count": 91,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"print(\"Train a Random Forest model\")\n",
	"rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion=\"entropy\", random_state=1)\n",
	"rf.fit(train[features], train[\"signal\"])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 92,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Train a XGBoost model\n"
	]
	}
	],
	"source": [
	"print(\"Train a XGBoost model\")\n",
	"params = {\"objective\": \"binary:logistic\",\n",
	" \"eta\": 0.2,\n",
	" \"max_depth\": 4,\n",
	" \"min_child_weight\": 1,\n",
	" \"silent\": 1,\n",
	" \"colsample_bytree\": 0.7,\n",
	" \"seed\": 1}\n",
	"num_trees=240\n",
	"gbm = xgb.train(params, xgb.DMatrix(train[features], train[\"signal\"]), num_trees)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 95,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Make predictions on the test set\n"
	]
	}
	],
	"source": [
	"print(\"Make predictions on the test set\")\n",
	"test_probs = (0.25*rf.predict_proba(test[features])[:,1]) + (0.25*gbm.predict(xgb.DMatrix(test[features])))+ + (0.25*fb_preds)+ (0.25*fb2_preds)\n",
	"submission = pd.DataFrame({\"id\": test[\"id\"], \"prediction\": test_probs})\n",
	"submission.to_csv(\"Physics\\\\Output\\\\Mixedmodel.csv\", index=False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 112,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# RF FTW :)\n",
	"clf1 = ensemble.RandomForestClassifier(n_jobs=400, n_estimators = 100, random_state = 45)\n",
	"#clf = ensemble.RandomForestClassifier(n_jobs=10, n_estimators = 50, random_state = 15)\n",
	"#SVC\n",
	"#clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)\n",
	"#clf = svm.SVC(C=2.0, cache_size=200, class_weight=1, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=True, random_state=4, shrinking=True, tol=0.001, verbose=False)\n",
	"#clf = svm.SVC(probability=True)\n",
	"#Stochastic Gradient Descent\n",
	"#clf = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)\n",
	"#clf=SGDClassifier(loss='log',alpha=0.000001,n_iter=100)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 111,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#clf = DecisionTreeClassifier(max_depth=5, min_samples_split=1, random_state=20)\n",
	"#RandomForestClassifier - \n",
	"clf2 = RandomForestClassifier(n_estimators=450, max_depth=10, min_samples_split=1, random_state=60)\n",
	"#clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=40)\n",
	"#ExtraTreesClassifier - 0.9808\n",
	"clf3 = ExtraTreesClassifier(n_estimators=350, max_depth=20, min_samples_split=2, random_state=160)\n",
	"#clf = ExtraTreesClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=200,bootstrap=True)\n",
	"#clf = ExtraTreesClassifier(n_estimators=200, max_depth=30, min_samples_split=4, random_state=200)\n",
	"#Nearest Neighbors Classifier\n",
	"#clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)\n",
	"#clf = neighbors.KNeighborsClassifier()\n",
	"#Decision Tree Classifier\n",
	"#clf = tree.DecisionTreeClassifier()\n",
	"#Adaboost\n",
	"#clf3 = AdaBoostClassifier(n_estimators=100)\n",
	"#GradientBoostingClassifier# - \n",
	"clf4 = GradientBoostingClassifier(n_estimators=600, learning_rate=1.0, max_depth=3, random_state=650)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 113,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"X = train[features]\n",
	"Y = train['signal']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 114,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"X_train, X_test, Y_train, Y_test = train_test_split(X,Y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 115,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',\n",
	" max_depth=3, max_features=None, max_leaf_nodes=None,\n",
	" min_samples_leaf=1, min_samples_split=2,\n",
	" min_weight_fraction_leaf=0.0, n_estimators=600,\n",
	" random_state=650, subsample=1.0, verbose=0, warm_start=False)"
	]
	},
	"execution_count": 115,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"clf1.fit(X_train,Y_train)\n",
	"clf2.fit(X_train,Y_train)\n",
	"clf3.fit(X_train,Y_train)\n",
	"clf4.fit(X_train,Y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 116,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.858724613654\n",
	"0.849783883001\n",
	"0.841790514536\n",
	"0.848185209308\n"
	]
	}
	],
	"source": [
	"predicted = clf1.predict(X_test)\n",
	"expected = Y_test\n",
	"print metrics.accuracy_score(expected,predicted)\n",
	"predicted = clf2.predict(X_test)\n",
	"expected = Y_test\n",
	"print metrics.accuracy_score(expected,predicted)\n",
	"predicted = clf3.predict(X_test)\n",
	"expected = Y_test\n",
	"print metrics.accuracy_score(expected,predicted)\n",
	"predicted = clf4.predict(X_test)\n",
	"expected = Y_test\n",
	"print metrics.accuracy_score(expected,predicted)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 117,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Make predictions on the test set\n"
	]
	}
	],
	"source": [
	"print(\"Make predictions on the test set\")\n",
	"test_probs = (0.15*clf1.predict_proba(test[features])[:,1]) + (0.15*clf2.predict_proba(test[features])[:,1]) + (0.50*clf3.predict_proba(test[features])[:,1]) + (0.20*clf4.predict_proba(test[features])[:,1])\n",
	"submission = pd.DataFrame({\"id\": test[\"id\"], \"prediction\": test_probs})\n",
	"submission.to_csv(\"Physics\\\\Output\\\\Mixedmodel.csv\", index=False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}