Skip to content

Instantly share code, notes, and snippets.

@Vikrant79
Created October 4, 2015 09:09
Show Gist options
  • Save Vikrant79/488abf986aa9d9399f87 to your computer and use it in GitHub Desktop.
Save Vikrant79/488abf986aa9d9399f87 to your computer and use it in GitHub Desktop.
Springleaf using xgb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn import ensemble, preprocessing, cross_validation\n",
"from sklearn.metrics import roc_auc_score as auc\n",
"from time import time\n",
"from pandas import Series, DataFrame\n",
"from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n",
"import sys\n",
"sys.path.append('C:\\\\Users\\\\Admin\\\\xgboost\\\\python-package')\n",
"import xgboost as xgb\n",
"from sklearn import ensemble, preprocessing, cross_validation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Admin\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py:1170: DtypeWarning: Columns (8,9,10,11,12,43,157,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n",
"C:\\Users\\Admin\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py:1170: DtypeWarning: Columns (8,9,10,11,12,43,157,167,177,196,214,225,228,229,231,235,238) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = self._reader.read(nrows)\n"
]
}
],
"source": [
"# PREPARE DATA\n",
"data = pd.read_csv('Input/train.csv').set_index(\"ID\")\n",
"test = pd.read_csv('Input/test.csv').set_index(\"ID\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# remove constants\n",
"nunique = pd.Series([data[col].nunique() for col in data.columns], index = data.columns)\n",
"constants = nunique[nunique<2].index.tolist()\n",
"data = data.drop(constants,axis=1)\n",
"test = test.drop(constants,axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\Admin\\Anaconda\\lib\\site-packages\\numpy\\lib\\arraysetops.py:198: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" flag = np.concatenate(([True], aux[1:] != aux[:-1]))\n",
"C:\\Users\\Admin\\Anaconda\\lib\\site-packages\\numpy\\lib\\arraysetops.py:251: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" return aux[:-1][aux[1:] == aux[:-1]]\n",
"C:\\Users\\Admin\\Anaconda\\lib\\site-packages\\numpy\\lib\\arraysetops.py:384: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.\n",
" bool_ar = (sar[1:] == sar[:-1])\n"
]
}
],
"source": [
"# encode string\n",
"strings = data.dtypes == 'object'; strings = strings[strings].index.tolist(); encoders = {}\n",
"for col in strings:\n",
" encoders[col] = preprocessing.LabelEncoder()\n",
" data[col] = encoders[col].fit_transform(data[col])\n",
" try:\n",
" test[col] = encoders[col].transform(test[col])\n",
" except:\n",
" # lazy way to incorporate the feature only if can be encoded in the test set\n",
" del test[col]\n",
" del data[col]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# DATA ready\n",
"#X1 = data.drop('target',1).fillna(0); y1 = data.target"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#X1 = data.drop('target',1).fillna(-9898989); y1 = data.target\n",
"X1 = data.drop('target',1).fillna(0); y1 = data.target"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# RF FTW :)\n",
"#rf = ensemble.RandomForestClassifier(n_jobs=4, n_estimators = 20, random_state = 11)\n",
"#rf = ensemble.RandomForestClassifier(n_jobs=500, n_estimators = 1000, random_state = 15)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"Earlier XGBoost configuration, kept for reference only; this is a markdown cell, so the code below is not executed.\n",
"\n",
"```python\n",
"print(\"Train a XGBoost model\")\n",
"params = {\"objective\": \"binary:logistic\",\n",
" \"eval_metric\": \"auc\",\n",
" \"eta\": 0.01,\n",
" \"max_depth\": 9,\n",
" \"min_child_weight\": 6,\n",
" \"silent\": 1,\n",
" \"subsample\": 0.7,\n",
" \"colsample_bytree\": 0.5,\n",
" \"alpha\": 4,\n",
" \"nthreads\": 3,\n",
" \"seed\": 1}\n",
"num_trees=7000\n",
"#num_trees=800\n",
"gbm = xgb.train(params, xgb.DMatrix(X1, y1), num_trees)\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": false
},
"source": [
"Earlier XGBoost configuration (leaderboard score 0.80022), kept for reference only; this is a markdown cell, so the code below is not executed.\n",
"\n",
"```python\n",
"# LB Score .80022\n",
"print(\"Train a XGBoost model\")\n",
"params = {\"objective\": \"binary:logistic\",\n",
" \"eval_metric\": \"auc\",\n",
" \"eta\": 0.01,\n",
" \"max_depth\": 18,\n",
" \"min_child_weight\": 6,\n",
" \"silent\": 1,\n",
" \"subsample\": 0.65,\n",
" \"colsample_bytree\": 0.65,\n",
" \"alpha\": 4,\n",
" \"nthreads\": 3,\n",
" \"seed\": 4}\n",
"num_trees=8500\n",
"#num_trees=800\n",
"gbm = xgb.train(params, xgb.DMatrix(X1, y1), num_trees)\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train a XGBoost model\n"
]
}
],
"source": [
"# 0.80055\n",
"print(\"Train a XGBoost model\")\n",
"params = {\"objective\": \"binary:logistic\",\n",
" \"eval_metric\": \"auc\",\n",
" \"eta\": 0.01,\n",
" \"max_depth\": 80,\n",
" \"min_child_weight\": 6,\n",
" \"silent\": 1,\n",
" \"subsample\": 0.7,\n",
" \"colsample_bytree\": 0.7,\n",
" \"alpha\": 4,\n",
" \"nthreads\": 3,\n",
" \"seed\": 8}\n",
"num_trees=9200\n",
"#num_trees=800\n",
"gbm = xgb.train(params, xgb.DMatrix(X1, y1), num_trees)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train a XGBoost model\n"
]
}
],
"source": [
"# 0.80100\n",
"print(\"Train a XGBoost model\")\n",
"params = {\"objective\": \"binary:logistic\",\n",
" \"eval_metric\": \"auc\",\n",
" \"eta\": 0.01,\n",
" \"max_depth\": 100,\n",
" \"min_child_weight\": 6,\n",
" \"silent\": 1,\n",
" \"subsample\": 0.7,\n",
" \"colsample_bytree\": 0.7,\n",
" \"alpha\": 3,\n",
" \"nthreads\": 4,\n",
" \"seed\": 8}\n",
"num_trees=10444\n",
"#num_trees=800\n",
"gbm = xgb.train(params, xgb.DMatrix(X1, y1), num_trees)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Make predictions on the test set\n"
]
}
],
"source": [
"print(\"Make predictions on the test set\")\n",
"#test_probs = (rf.predict_proba(test[features])[:,1] +\n",
"# gbm.predict(xgb.DMatrix(test[features])))/2\n",
"#test_probs = gbm.predict(xgb.DMatrix(test.fillna(0)))\n",
"#test_probs = gbm.predict(xgb.DMatrix(test.fillna(-9898989)))\n",
"test_probs = gbm.predict(xgb.DMatrix(test.fillna(0)))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"submission = pd.read_csv('Input/sample_submission.csv')\n",
"submission[\"target\"] = test_probs"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"submission.to_csv('Output/xgb_benchmark5.csv', index = False)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0.392047</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>0.321742</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>0.194858</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>9</td>\n",
" <td>0.312087</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10</td>\n",
" <td>0.664403</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID target\n",
"0 1 0.392047\n",
"1 3 0.321742\n",
"2 6 0.194858\n",
"3 9 0.312087\n",
"4 10 0.664403"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"submission.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment