Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Vikrant79/d14da7541a364e6232b7 to your computer and use it in GitHub Desktop.
Save Vikrant79/d14da7541a364e6232b7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn import ensemble, preprocessing, cross_validation\n",
"from sklearn.metrics import roc_auc_score as auc\n",
"from time import time\n",
"from sklearn import svm\n",
"from sklearn.svm import SVC\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import ExtraTreesClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn import neighbors, datasets\n",
"from sklearn import tree\n",
"from sklearn.datasets import make_hastie_10_2\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from pandas import Series,DataFrame\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn import metrics\n",
"import math\n",
"import statsmodels.api as sm\n",
"from datetime import datetime\n",
"import requests\n",
"from StringIO import StringIO\n",
"from time import time\n",
"import sys\n",
"sys.path.append('C:\\\\Users\\\\Vikrant\\\\xgboost\\\\python-package')\n",
"import xgboost as xgb\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Vikrant\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py:1170: DtypeWarning: Columns (8,9,10,11,12,43,157,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"C:\\Users\\Vikrant\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py:1170: DtypeWarning: Columns (8,9,10,11,12,43,157,167,177,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
}
],
"source": [
"# PREPARE DATA\n",
"data = pd.read_csv('train_Spring.csv').set_index(\"ID\")\n",
"test = pd.read_csv('test_Spring.csv').set_index(\"ID\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# remove constants\n",
"nunique = pd.Series([data[col].nunique() for col in data.columns], index = data.columns)\n",
"constants = nunique[nunique<2].index.tolist()\n",
"data = data.drop(constants,axis=1)\n",
"test = test.drop(constants,axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Vikrant\\Anaconda\\lib\\site-packages\\numpy\\lib\\arraysetops.py:198: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" flag = np.concatenate(([True], aux[1:] != aux[:-1]))\n",
"C:\\Users\\Vikrant\\Anaconda\\lib\\site-packages\\numpy\\lib\\arraysetops.py:251: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" return aux[:-1][aux[1:] == aux[:-1]]\n",
"C:\\Users\\Vikrant\\Anaconda\\lib\\site-packages\\numpy\\lib\\arraysetops.py:384: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" bool_ar = (sar[1:] == sar[:-1])\n"
]
}
],
"source": [
"# encode string\n",
"strings = data.dtypes == 'object'; strings = strings[strings].index.tolist(); encoders = {}\n",
"for col in strings:\n",
"    encoders[col] = preprocessing.LabelEncoder()\n",
"    data[col] = encoders[col].fit_transform(data[col])\n",
"    try:\n",
"        test[col] = encoders[col].transform(test[col])\n",
"    except:\n",
"        # lazy way to incorporate the feature only if it can be encoded in the test set\n",
"        del test[col]\n",
"        del data[col]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# DATA ready\n",
"X = data.drop('target',1).fillna(0); y = data.target"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# RF FTW :)\n",
"#clf = ensemble.RandomForestClassifier(n_jobs=4, n_estimators = 20, random_state = 11)\n",
"#clf = ensemble.RandomForestClassifier(n_jobs=500, n_estimators = 1000, random_state = 15)\n",
"#SVC\n",
"#clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)\n",
"#clf = svm.SVC(C=2.0, cache_size=200, class_weight=1, coef0=0.0, degree=3, gamma=0.0, kernel='rbf', max_iter=-1, probability=True, random_state=4, shrinking=True, tol=0.001, verbose=False)\n",
"#Stochastic Gradient Descent\n",
"#clf = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=True, verbose=0, warm_start=False)\n",
"#clf=SGDClassifier(loss='log',alpha=0.000001,n_iter=100)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#clf = DecisionTreeClassifier(max_depth=5, min_samples_split=1, random_state=20)\n",
"#RandomForestClassifier - 0.75985\n",
"#clf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=1, random_state=50)\n",
"#clf = RandomForestClassifier(n_estimators=400, max_depth=20, min_samples_split=1, random_state=200)\n",
"#ExtraTreesClassifier - 0.76\n",
"clf1 = ExtraTreesClassifier(n_estimators=600, max_depth=20, min_samples_split=2, random_state=500)\n",
"#clf = ExtraTreesClassifier(n_estimators=200, max_depth=30, min_samples_split=4, random_state=200)\n",
"#Nearest Neighbors Classifier\n",
"#clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)\n",
"#clf = neighbors.KNeighborsClassifier()\n",
"#Decision Tree Classifier\n",
"#clf = tree.DecisionTreeClassifier()\n",
"#Adaboost\n",
"#clf = AdaBoostClassifier(n_estimators=100)\n",
"#GradientBoostingClassifier# - 0.77459\n",
"clf2 = GradientBoostingClassifier(n_estimators=600, learning_rate=1.0, max_depth=1, random_state=600)\n",
"#clf = ensemble.GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0, max_depth=1, random_state=800)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X_train, X_test, Y_train, Y_test = train_test_split(X,y)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',\n",
" max_depth=1, max_features=None, max_leaf_nodes=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=1000,\n",
" random_state=800, subsample=1.0, verbose=0, warm_start=False)"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#clf.fit(X_train,Y_train)\n",
"clf1.fit(X_train,Y_train)\n",
"clf2.fit(X_train,Y_train)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.788806874518\n"
]
}
],
"source": [
"predicted = clf1.predict(X_test)\n",
"expected = Y_test\n",
"print metrics.accuracy_score(expected,predicted)\n",
"predicted = clf2.predict(X_test)\n",
"expected = Y_test\n",
"print metrics.accuracy_score(expected,predicted)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"submission = pd.DataFrame(clf2.fit(X,y).predict_proba(test.fillna(0))[:,1], index=test.index, columns=['target'])\n",
"submission.index.name = 'ID'\n",
"submission.to_csv('Springleaf\\\\Output\\\\Springleaf.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#test_probs = pd.DataFrame((0.4*clf1.fit(X,y).predict_proba(test.fillna(0))[:,1], index=test.index, columns=['target']) + (0.6*clf2.fit(X,y).predict_proba(test.fillna(0))[:,1], index=test.index, columns=['target']))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# CROSS VALIDATE AND PRINT TRAIN AND TEST SCORE\n",
"#kf = cross_validation.StratifiedKFold(y, n_folds=5, shuffle=True, random_state=11)\n",
"#trscores, cvscores, times = [], [], []\n",
"#for itr, icv in kf:\n",
"# t = time()\n",
"# trscore = auc(y.iloc[itr], clf.fit(X.iloc[itr], y.iloc[itr]).predict_proba(X.iloc[itr])[:,1])\n",
"# cvscore = auc(y.iloc[icv], clf.predict_proba(X.iloc[icv])[:,1])\n",
"# trscores.append(trscore); cvscores.append(cvscore); times.append(time()-t)\n",
"#print \"TRAIN %.4f | TEST %.4f | TIME %.2fm (1-fold)\" % (np.mean(trscores), np.mean(cvscores), np.mean(times)/60)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# MAKING SUBMISSION\n",
"#submission = pd.DataFrame(clf.fit(X,y).predict_proba(test.fillna(0))[:,1], index=test.index, columns=['target'])\n",
"#submission.index.name = 'ID'\n",
"#submission.to_csv('Springleaf10.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment