Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ocoyawale/e284fc8cde5a042c8cdd71a3b462f74c to your computer and use it in GitHub Desktop.
Save ocoyawale/e284fc8cde5a042c8cdd71a3b462f74c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn import ensemble, preprocessing, cross_validation\n",
"from sklearn.metrics import roc_auc_score as auc\n",
"from time import time\n",
"from sklearn import svm\n",
"from sklearn.svm import SVC\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import ExtraTreesClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn import neighbors, datasets\n",
"from sklearn import tree\n",
"from sklearn.datasets import make_hastie_10_2\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from pandas import Series,DataFrame\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn import metrics\n",
"import math\n",
"import statsmodels.api as sm\n",
"from datetime import datetime\n",
"import requests\n",
"from StringIO import StringIO\n",
"from time import time\n",
"import sys\n",
"sys.path.append('C:\\\\Users\\\\Vikrant\\\\xgboost\\\\python-package')\n",
"import xgboost as xgb\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# PREPARE DATA\n",
"# Paths are assembled with os.path.join instead of hard-coded Windows\n",
"# separators, so the notebook also runs on hosts with a different separator.\n",
"import os\n",
"\n",
"input_dir = os.path.join('Physics', 'Input')\n",
"data = pd.read_csv(os.path.join(input_dir, 'training.csv'))  # training set\n",
"test = pd.read_csv(os.path.join(input_dir, 'test.csv'))  # test set to predict on"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Target vector: the `signal` column of the training frame.\n",
"Y = data['signal']"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>LifeTime</th>\n",
" <th>dira</th>\n",
" <th>FlightDistance</th>\n",
" <th>FlightDistanceError</th>\n",
" <th>IP</th>\n",
" <th>IPSig</th>\n",
" <th>VertexChi2</th>\n",
" <th>pt</th>\n",
" <th>DOCAone</th>\n",
" <th>...</th>\n",
" <th>p0_pt</th>\n",
" <th>p1_pt</th>\n",
" <th>p2_pt</th>\n",
" <th>p0_p</th>\n",
" <th>p1_p</th>\n",
" <th>p2_p</th>\n",
" <th>p0_eta</th>\n",
" <th>p1_eta</th>\n",
" <th>p2_eta</th>\n",
" <th>SPDhits</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>14711831</td>\n",
" <td>0.001273</td>\n",
" <td>0.999816</td>\n",
" <td>8.569642</td>\n",
" <td>0.310039</td>\n",
" <td>0.197893</td>\n",
" <td>5.183939</td>\n",
" <td>2.286117</td>\n",
" <td>687.479248</td>\n",
" <td>0.022301</td>\n",
" <td>...</td>\n",
" <td>508.926514</td>\n",
" <td>306.629456</td>\n",
" <td>714.222717</td>\n",
" <td>30300.734375</td>\n",
" <td>5662.158691</td>\n",
" <td>7406.368164</td>\n",
" <td>4.779700</td>\n",
" <td>3.608334</td>\n",
" <td>3.029715</td>\n",
" <td>705</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>16316387</td>\n",
" <td>0.000661</td>\n",
" <td>0.999954</td>\n",
" <td>7.061379</td>\n",
" <td>0.348772</td>\n",
" <td>0.067430</td>\n",
" <td>6.316281</td>\n",
" <td>3.865824</td>\n",
" <td>5691.845703</td>\n",
" <td>0.002455</td>\n",
" <td>...</td>\n",
" <td>943.130676</td>\n",
" <td>3287.291260</td>\n",
" <td>1563.593994</td>\n",
" <td>6160.323730</td>\n",
" <td>43781.566406</td>\n",
" <td>19728.500000</td>\n",
" <td>2.563915</td>\n",
" <td>3.280884</td>\n",
" <td>3.226650</td>\n",
" <td>907</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6771382</td>\n",
" <td>0.000889</td>\n",
" <td>0.999999</td>\n",
" <td>6.502967</td>\n",
" <td>0.273914</td>\n",
" <td>0.011020</td>\n",
" <td>0.865816</td>\n",
" <td>1.004631</td>\n",
" <td>5429.225586</td>\n",
" <td>0.002753</td>\n",
" <td>...</td>\n",
" <td>2886.055908</td>\n",
" <td>2216.936279</td>\n",
" <td>598.068359</td>\n",
" <td>26387.134766</td>\n",
" <td>15534.978516</td>\n",
" <td>6504.145020</td>\n",
" <td>2.903129</td>\n",
" <td>2.634985</td>\n",
" <td>3.077517</td>\n",
" <td>213</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>686045</td>\n",
" <td>0.001295</td>\n",
" <td>0.999301</td>\n",
" <td>7.838184</td>\n",
" <td>0.325697</td>\n",
" <td>0.388636</td>\n",
" <td>11.391245</td>\n",
" <td>10.345356</td>\n",
" <td>661.717834</td>\n",
" <td>0.008163</td>\n",
" <td>...</td>\n",
" <td>461.727753</td>\n",
" <td>357.967560</td>\n",
" <td>1118.084229</td>\n",
" <td>5451.857422</td>\n",
" <td>10281.403320</td>\n",
" <td>23722.742188</td>\n",
" <td>3.160085</td>\n",
" <td>4.050494</td>\n",
" <td>3.747409</td>\n",
" <td>275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8755882</td>\n",
" <td>0.001714</td>\n",
" <td>0.999805</td>\n",
" <td>7.064559</td>\n",
" <td>0.113440</td>\n",
" <td>0.154421</td>\n",
" <td>13.925871</td>\n",
" <td>12.951378</td>\n",
" <td>3885.417969</td>\n",
" <td>0.066317</td>\n",
" <td>...</td>\n",
" <td>1773.747925</td>\n",
" <td>1643.278198</td>\n",
" <td>711.572205</td>\n",
" <td>7724.954590</td>\n",
" <td>12830.546875</td>\n",
" <td>6008.145996</td>\n",
" <td>2.151059</td>\n",
" <td>2.744157</td>\n",
" <td>2.823017</td>\n",
" <td>357</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 47 columns</p>\n",
"</div>"
],
"text/plain": [
" id LifeTime dira FlightDistance FlightDistanceError \\\n",
"0 14711831 0.001273 0.999816 8.569642 0.310039 \n",
"1 16316387 0.000661 0.999954 7.061379 0.348772 \n",
"2 6771382 0.000889 0.999999 6.502967 0.273914 \n",
"3 686045 0.001295 0.999301 7.838184 0.325697 \n",
"4 8755882 0.001714 0.999805 7.064559 0.113440 \n",
"\n",
" IP IPSig VertexChi2 pt DOCAone ... \\\n",
"0 0.197893 5.183939 2.286117 687.479248 0.022301 ... \n",
"1 0.067430 6.316281 3.865824 5691.845703 0.002455 ... \n",
"2 0.011020 0.865816 1.004631 5429.225586 0.002753 ... \n",
"3 0.388636 11.391245 10.345356 661.717834 0.008163 ... \n",
"4 0.154421 13.925871 12.951378 3885.417969 0.066317 ... \n",
"\n",
" p0_pt p1_pt p2_pt p0_p p1_p \\\n",
"0 508.926514 306.629456 714.222717 30300.734375 5662.158691 \n",
"1 943.130676 3287.291260 1563.593994 6160.323730 43781.566406 \n",
"2 2886.055908 2216.936279 598.068359 26387.134766 15534.978516 \n",
"3 461.727753 357.967560 1118.084229 5451.857422 10281.403320 \n",
"4 1773.747925 1643.278198 711.572205 7724.954590 12830.546875 \n",
"\n",
" p2_p p0_eta p1_eta p2_eta SPDhits \n",
"0 7406.368164 4.779700 3.608334 3.029715 705 \n",
"1 19728.500000 2.563915 3.280884 3.226650 907 \n",
"2 6504.145020 2.903129 2.634985 3.077517 213 \n",
"3 23722.742188 3.160085 4.050494 3.747409 275 \n",
"4 6008.145996 2.151059 2.744157 2.823017 357 \n",
"\n",
"[5 rows x 47 columns]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the test frame.\n",
"test.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# DATA ready\n",
"# Columns removed from the training frame before modelling: the target\n",
"# (`signal`), the row identifier (`id`) and `mass`, `production`, `min_ANNmuon`.\n",
"dropped_columns = ['signal', 'mass', 'production', 'min_ANNmuon', 'id']\n",
"# axis is passed by keyword; the positional form drop(label, 1) is deprecated\n",
"# in newer pandas releases.\n",
"X = data.drop(dropped_columns, axis=1).fillna(0)\n",
"# Apply the same missing-value treatment to the test features so that both\n",
"# matrices are prepared identically.\n",
"test1 = test.drop('id', axis=1).fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Min-max scale the features.\n",
"# The scaler is fitted on the training features only, and that same fitted\n",
"# transform is then applied to the test features. Fitting a second scaler on\n",
"# the test set would rescale it with its own min/max values, so the model would\n",
"# be asked to predict on features scaled differently from its training data.\n",
"scaler = preprocessing.MinMaxScaler()\n",
"X = scaler.fit_transform(X)\n",
"test1 = scaler.transform(test1)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Make sure both feature matrices hold floating point values.\n",
"X, test1 = X.astype(float), test1.astype(float)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# RF FTW :)\n",
"#clf = ensemble.RandomForestClassifier(n_jobs=30, n_estimators = 70, random_state = 25)\n",
"#clf = ensemble.RandomForestClassifier(n_jobs=10, n_estimators = 50, random_state = 15)\n",
"#SVC\n",
"#clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)\n",
"#clf = svm.SVC(C=2.0, cache_size=200, class_weight=1, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=True, random_state=4, shrinking=True, tol=0.001, verbose=False)\n",
"#clf = svm.SVC(probability=True)\n",
"#Stochastic Gradient Descent\n",
"#clf = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)\n",
"#clf=SGDClassifier(loss='log',alpha=0.000001,n_iter=100)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Estimator used by the cells that follow. Only the ExtraTreesClassifier\n",
"# assignment is active; the other configurations are left commented out.\n",
"#ExtraTreesClassifier - 0.9808\n",
"clf = ExtraTreesClassifier(\n",
"    n_estimators=150,\n",
"    max_depth=20,\n",
"    min_samples_split=2,\n",
"    random_state=100,\n",
")\n",
"#clf = ExtraTreesClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=200,bootstrap=True)\n",
"#clf = ExtraTreesClassifier(n_estimators=200, max_depth=30, min_samples_split=4, random_state=200)\n",
"#Decision tree\n",
"#clf = DecisionTreeClassifier(max_depth=5, min_samples_split=1, random_state=20)\n",
"#clf = tree.DecisionTreeClassifier()\n",
"#RandomForestClassifier\n",
"#clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=50)\n",
"#clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=40)\n",
"#Nearest Neighbors Classifier\n",
"#clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)\n",
"#clf = neighbors.KNeighborsClassifier()\n",
"#Adaboost\n",
"#clf = AdaBoostClassifier(n_estimators=100)\n",
"#GradientBoostingClassifier\n",
"#clf = GradientBoostingClassifier(n_estimators=400, learning_rate=1.0, max_depth=3, random_state=200)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# CROSS VALIDATE AND PRINT TRAIN AND TEST SCORE\n",
"#kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True, random_state=11)\n",
"#trscores, cvscores, times = [], [], []\n",
"#for itr, icv in kf:\n",
"# t = time()\n",
"# trscore = auc(y.iloc[itr], clf.fit(X.iloc[itr], y.iloc[itr]).predict_proba(X.iloc[itr])[:,1])\n",
"# cvscore = auc(y.iloc[icv], clf.predict_proba(X.iloc[icv])[:,1])\n",
"# trscores.append(trscore); cvscores.append(cvscore); times.append(time()-t)\n",
"#print \"TRAIN %.4f | TEST %.4f | TIME %.2fm (1-fold)\" % (np.mean(trscores), np.mean(cvscores), np.mean(times)/60)"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hold out part of the training data for a quick local check.\n",
"# random_state is pinned so the split, and the accuracy computed from it, is\n",
"# reproducible between runs, in line with the seeded classifier.\n",
"X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=100)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',\n",
" max_depth=20, max_features='auto', max_leaf_nodes=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,\n",
" oob_score=False, random_state=100, verbose=0, warm_start=False)"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Train on the training part of the split; the fitted estimator is the value\n",
"# displayed by this cell.\n",
"clf.fit(X=X_train, y=Y_train)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.852270708745\n"
]
}
],
"source": [
"# Accuracy of the fitted classifier on the held-out part of the split.\n",
"predicted = clf.predict(X_test)\n",
"expected = Y_test\n",
"# print(...) with a single argument behaves the same under Python 2 and\n",
"# Python 3, unlike the Python-2-only print statement.\n",
"print(metrics.accuracy_score(expected, predicted))"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# MAKING SUBMISSION\n",
"submission = pd.DataFrame(clf.fit(X,Y).predict_proba(test1)[:,1], index=test.id, columns=['prediction'])\n",
"submission.index.name = 'id'\n",
"submission.to_csv('Physics\\\\Output\\\\Results.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Same steps as the 'MAKING SUBMISSION' cell: refit on all training data and\n",
"# write the second predict_proba column, indexed by test id, to Results.csv.\n",
"refit = clf.fit(X, Y)\n",
"proba_column = refit.predict_proba(test1)[:, 1]\n",
"submission = pd.DataFrame(proba_column, index=test.id, columns=['prediction'])\n",
"submission.index.name = 'id'\n",
"submission.to_csv('Physics\\\\Output\\\\Results.csv')"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Settings handed to xgb.train: the parameter dictionary and the number of\n",
"# boosting rounds.\n",
"xgb_params = dict(objective=\"binary:logistic\", max_depth=15, silent=1)\n",
"num_rounds = 400"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Wrap the feature matrices in DMatrix objects, train a booster on the full\n",
"# training set and predict for the test set.\n",
"dtrain = xgb.DMatrix(X, label=Y)\n",
"dtest = xgb.DMatrix(test1)\n",
"gbdt = xgb.train(xgb_params, dtrain, num_boost_round=num_rounds)\n",
"preds = gbdt.predict(dtest)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Write the XGBoost predictions. The identifier column is named 'id' so the\n",
"# header matches the `id` column of the test file and the header used for\n",
"# Results.csv (previously this file alone was written with an 'ID' header).\n",
"submission = pd.DataFrame({\"id\": test.id, \"prediction\": preds})\n",
"submission = submission.set_index('id')\n",
"submission.to_csv('Physics\\\\Output\\\\Results1.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment