pronojitsaha/XGB2.ipynb

## XGB2.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# suppress pandas warnings\n",
    "import warnings\n",
    "warnings.simplefilter(action = \"ignore\", category = RuntimeWarning)\n",
    "warnings.simplefilter(action = \"ignore\", category = FutureWarning)\n",
    "\n",
    "# imports\n",
    "import sys\n",
    "import xgboost as xgb\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import preprocessing\n",
    "import pandas as pd\n",
    "from numpy.random import seed\n",
    "from sklearn.cross_validation import StratifiedShuffleSplit\n",
    "\n",
    "# reproduce results\n",
    "seed(786)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((8123, 32), (2032, 31))"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Read the files\n",
    "train = pd.read_csv('data/train_K9K1f9B.csv')\n",
    "test = pd.read_csv('data/test_yIjzS7t.csv')\n",
    "train.shape, test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Columns in which the literal string 'Unknown' stands for a missing value\n",
    "unknown_cols = ['status', 'occupation', 'occupation_partner', 'region']\n",
    "for col in unknown_cols:\n",
    "    # boolean row mask + column label -> .loc (the .ix indexer is deprecated)\n",
    "    train.loc[train[col] == 'Unknown', col] = np.nan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I also treated the 'Unknown' values in these features as missing values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#For test set: same 'Unknown' -> NaN replacement as for train\n",
    "for col in unknown_cols:\n",
    "    # boolean row mask + column label -> .loc (the .ix indexer is deprecated)\n",
    "    test.loc[test[col] == 'Unknown', col] = np.nan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Missing values were not given any special treatment, as XGBoost has its own way of handling them, so we leave it to the algorithm to decide what is best. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The feature 'post_area' had a lot of levels, so the less frequent areas were grouped into a single level (i.e. 'Others'). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# post_area levels seen fewer than 5 times in train are collapsed into \"Others\"\n",
    "pa_counts = train.post_area.value_counts(dropna=True)\n",
    "pa_counts_rare = list(pa_counts[pa_counts<5].index)\n",
    "# .loc replaces the deprecated .ix indexer (boolean mask + column label)\n",
    "train.loc[train['post_area'].isin(pa_counts_rare), \"post_area\"] = \"Others\"\n",
    "test.loc[test['post_area'].isin(pa_counts_rare), \"post_area\"] = \"Others\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For test set, we take the new values of 'post_area' which are not in train set as 'Others' too. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# post_area levels that occur in test but not in train are folded into \"Others\"\n",
    "newpostareas = list(set(test['post_area']) - set(train['post_area']))\n",
    "# .loc replaces the deprecated .ix indexer (boolean mask + column label)\n",
    "test.loc[test['post_area'].isin(newpostareas), \"post_area\"] = \"Others\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Binary target: Silver -> 0, Gold -> 1\n",
    "label = train['Account.Status'].map({'Silver': 0, 'Gold': 1})\n",
    "\n",
    "#I dropped the feature 'post_code' since its highest frequency was 2 and hence would not add any value to the model.\n",
    "dropCols = ['REF_NO', 'Account.Status', 'post_code']\n",
    "# drop() returns a new frame, so `train` itself is left untouched\n",
    "data = train.drop(dropCols, axis=1)\n",
    "\n",
    "X = pd.get_dummies(data) # one indicator column per level of each categorical feature\n",
    "y = label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#For test\n",
    "# Drop the same columns as for train, minus the target column\n",
    "testdropcols = [c for c in dropCols if c != 'Account.Status']\n",
    "# drop() already returns a new frame, so no prior test.copy() is needed\n",
    "test2 = test.drop(testdropcols, axis=1)\n",
    "\n",
    "Final_test = pd.get_dummies(test2)\n",
    "# Add the dummy columns that exist for train but not for test, filled with 0\n",
    "missingCols = [c for c in X.columns if c not in Final_test.columns]\n",
    "for col in missingCols:\n",
    "    Final_test[col] = 0\n",
    "# Keep exactly the training columns, in the training order\n",
    "Final_test = Final_test[X.columns]\n",
    "assert X.columns.equals(Final_test.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    7255\n",
       "1     868\n",
       "dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "label.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "StratifiedShuffleSplit(labels=[0 0 0 ..., 0 0 0], n_iter=1, test_size=0.25, random_state=0)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Creating a hold out set using stratified sampling as our target variable is skewed\n",
    "holdout_fold = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0)\n",
    "holdout_fold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# The splitter yields positional row indices, so select by position with .iloc\n",
    "# (replaces the deprecated .ix and the label-based y[...] lookup)\n",
    "for train_index, holdout_index in holdout_fold:\n",
    "    X_train = X.iloc[train_index]\n",
    "    X_test = X.iloc[holdout_index]\n",
    "    y_train = y.iloc[train_index]\n",
    "    y_test = y.iloc[holdout_index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Define the evaluation function\n",
    "def evaluation(y_predicted, y_true):\n",
    "    Ns = sum([1 if (a == 0 and b == 1) else 0 for (a,b) in zip(y_predicted,y_true)])\n",
    "    Ng = sum([1 if (a == 1 and b == 0) else 0 for (a,b) in zip(y_predicted,y_true)])\n",
    "    T = len(y_true)\n",
    "    M = (8*Ns + 2*Ng)/float(T)\n",
    "    return M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#finding the ideal learning rate and num_rounds\n",
    "params = {}\n",
    "params[\"objective\"] = \"binary:logistic\"\n",
    "params[\"max_depth\"] = 10\n",
    "params[\"eta\"] = 0.01 #step-size shrinkage in [0,1]; lower is more conservative, so if reduced then increase num_rounds\n",
    "params[\"eval_metric\"] = 'logloss' #key previously had a trailing space and so did not match the xgboost parameter name\n",
    "params[\"seed\"] = 0\n",
    "params[\"silent\"] = 1\n",
    "plst = list(params.items())\n",
    "num_rounds = 10000\n",
    "\n",
    "xgtrain = xgb.DMatrix(X_train, label=y_train) #weight= trainX_mobN_weight\n",
    "#xgb.cv(params, xgtrain, num_rounds, nfold=4, metrics={'logloss'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "xgb.cv gives the following output"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[1074]\tcv-test-logloss:0.097280+0.003721\tcv-train-logloss:0.028058+0.001354    \n",
    "[1075]\tcv-test-logloss:0.097280+0.003718\tcv-train-logloss:0.028017+0.001332    \n",
    "[1076]\tcv-test-logloss:0.097291+0.003725\tcv-train-logloss:0.027987+0.001338    \n",
    "[1077]\tcv-test-logloss:0.097279+0.003724\tcv-train-logloss:0.027958+0.001339    \n",
    "[1078]\tcv-test-logloss:0.097273+0.003725\tcv-train-logloss:0.027924+0.001347    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As one can see from above the overfitting starts at around tree number 1077."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#give higher weights to the labels 1 as the target variable is skewed\n",
    "# The loop variable is deliberately not called `data`: under Python 2 a list-comprehension\n",
    "# variable leaks into the enclosing scope and would overwrite the `data` DataFrame.\n",
    "Xtrain_weight = [2 if label_value == 1 else 1 for label_value in y_train]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Having determined the ideal learning rate and num_rounds, we tune the other parameters\n",
    "params = {}\n",
    "params[\"objective\"] = \"binary:logistic\"\n",
    "#To avoid overfitting: The first way is to directly control model complexity\n",
    "params[\"min_child_weight\"] = 3 #The larger, the more conservative the algorithm will be.\n",
    "params[\"max_depth\"] = 10\n",
    "#params[\"gamma\"] = 0 #The larger, the more conservative the algorithm will be.\n",
    "params[\"eta\"] = 0.01 #higher is more conservative [0,1], if reduced then increase num_rounds\n",
    "#The second way is to add randomness to make training robust to noise\n",
    "params[\"subsample\"] = 0.9\n",
    "params[\"colsample_bytree\"] = 0.9\n",
    "\n",
    "#Handle Imbalanced Dataset\n",
    "#If you care only about the ranking order (AUC) of your prediction\n",
    "#params[\"scale_pos_weight\"] = 1 #ratio of labels in target variable\n",
    "params[\"eval_metric \"] = 'logloss'\n",
    "#If you care about predicting the right probability\n",
    "params[\"max_delta_step\"]= 8 #should be high for skewed data\n",
    "\n",
    "params[\"seed\"] = 0\n",
    "params[\"silent\"] = 1\n",
    "params[\"nthread\"] = 4\n",
    "plst = list(params.items())\n",
    "num_rounds = 1100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "xgtrain = xgb.DMatrix(X_train, label=y_train, weight= Xtrain_weight)\n",
    "xgtest = xgb.DMatrix(X_test)\n",
    "model = xgb.train(plst, xgtrain, num_rounds)\n",
    "pred_ytest = model.predict(xgtest)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Our evaluation function has a higher penalty for misclassifying a case as Silver (i.e. 0). So it is better for our model to predict more Gold (i.e. 1) and be wrong on those than to predict more Silver and be wrong on those. We therefore chose a low decision threshold for the prediction (by trial and error); in our case it is 0.30. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "predictions = [1 if pred > 0.30 else 0 for pred in pred_ytest]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.12506154603643527"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "evaluation(predictions,y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Predict on the test set\n",
    "# Refit on the full training data with the same 2:1 weighting of label 1.\n",
    "# The loop variable is deliberately not called `data`: under Python 2 a list-comprehension\n",
    "# variable leaks into the enclosing scope and would overwrite the `data` DataFrame.\n",
    "X_weight = [2 if label_value == 1 else 1 for label_value in y]\n",
    "xgtrain = xgb.DMatrix(X, label=y, weight= X_weight)\n",
    "xgtest = xgb.DMatrix(Final_test)\n",
    "model_full = xgb.train(plst, xgtrain, num_rounds)\n",
    "pred_Finaltest = model_full.predict(xgtest)\n",
    "# Same 0.30 cut-off as used on the hold-out set\n",
    "predictions_final = ['Gold' if pred > 0.30 else 'Silver' for pred in pred_Finaltest]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We save our data for ensembling with the other XGB model that we built. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "to_ensemble = pd.DataFrame({ 'REF_NO': test['REF_NO'], 'Account.Status':pred_Finaltest})\n",
    "to_ensemble = to_ensemble[['REF_NO', 'Account.Status']]\n",
    "to_ensemble.to_csv(\"data/subXGB2.csv\", index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# suppress pandas warnings\n",
	"import warnings\n",
	"warnings.simplefilter(action = \"ignore\", category = RuntimeWarning)\n",
	"warnings.simplefilter(action = \"ignore\", category = FutureWarning)\n",
	"\n",
	"# imports\n",
	"import sys\n",
	"import xgboost as xgb\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"from sklearn import preprocessing\n",
	"import pandas as pd\n",
	"from numpy.random import seed\n",
	"from sklearn.cross_validation import StratifiedShuffleSplit\n",
	"\n",
	"# reproduce results\n",
	"seed(786)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"((8123, 32), (2032, 31))"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"#Read the files\n",
	"train = pd.read_csv('data/train_K9K1f9B.csv')\n",
	"test = pd.read_csv('data/test_yIjzS7t.csv')\n",
	"train.shape, test.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Columns in which the literal string 'Unknown' stands for a missing value\n",
	"unknown_cols = ['status', 'occupation', 'occupation_partner', 'region']\n",
	"for col in unknown_cols:\n",
	"    # boolean row mask + column label -> .loc (the .ix indexer is deprecated)\n",
	"    train.loc[train[col] == 'Unknown', col] = np.nan"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"I also treated the 'Unknown' values in these features as missing values."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#For test set: same 'Unknown' -> NaN replacement as for train\n",
	"for col in unknown_cols:\n",
	"    # boolean row mask + column label -> .loc (the .ix indexer is deprecated)\n",
	"    test.loc[test[col] == 'Unknown', col] = np.nan"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Missing values were not given any special treatment, as XGBoost has its own way of handling them, so we leave it to the algorithm to decide what is best. "
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The feature 'post_area' had a lot of levels, so the less frequent areas were grouped into a single level (i.e. 'Others'). "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# post_area levels seen fewer than 5 times in train are collapsed into \"Others\"\n",
	"pa_counts = train.post_area.value_counts(dropna=True)\n",
	"pa_counts_rare = list(pa_counts[pa_counts<5].index)\n",
	"# .loc replaces the deprecated .ix indexer (boolean mask + column label)\n",
	"train.loc[train['post_area'].isin(pa_counts_rare), \"post_area\"] = \"Others\"\n",
	"test.loc[test['post_area'].isin(pa_counts_rare), \"post_area\"] = \"Others\""
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"For test set, we take the new values of 'post_area' which are not in train set as 'Others' too. "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# post_area levels that occur in test but not in train are folded into \"Others\"\n",
	"newpostareas = list(set(test['post_area']) - set(train['post_area']))\n",
	"# .loc replaces the deprecated .ix indexer (boolean mask + column label)\n",
	"test.loc[test['post_area'].isin(newpostareas), \"post_area\"] = \"Others\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Binary target: Silver -> 0, Gold -> 1\n",
	"label = train['Account.Status'].map({'Silver': 0, 'Gold': 1})\n",
	"\n",
	"#I dropped the feature 'post_code' since its highest frequency was 2 and hence would not add any value to the model.\n",
	"dropCols = ['REF_NO', 'Account.Status', 'post_code']\n",
	"# drop() returns a new frame, so `train` itself is left untouched\n",
	"data = train.drop(dropCols, axis=1)\n",
	"\n",
	"X = pd.get_dummies(data) # one indicator column per level of each categorical feature\n",
	"y = label"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#For test\n",
	"# Drop the same columns as for train, minus the target column\n",
	"testdropcols = [c for c in dropCols if c != 'Account.Status']\n",
	"# drop() already returns a new frame, so no prior test.copy() is needed\n",
	"test2 = test.drop(testdropcols, axis=1)\n",
	"\n",
	"Final_test = pd.get_dummies(test2)\n",
	"# Add the dummy columns that exist for train but not for test, filled with 0\n",
	"missingCols = [c for c in X.columns if c not in Final_test.columns]\n",
	"for col in missingCols:\n",
	"    Final_test[col] = 0\n",
	"# Keep exactly the training columns, in the training order\n",
	"Final_test = Final_test[X.columns]\n",
	"assert X.columns.equals(Final_test.columns)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0 7255\n",
	"1 868\n",
	"dtype: int64"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"label.value_counts()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"StratifiedShuffleSplit(labels=[0 0 0 ..., 0 0 0], n_iter=1, test_size=0.25, random_state=0)"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"#Creating a hold out set using stratified sampling as our target variable is skewed\n",
	"holdout_fold = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0)\n",
	"holdout_fold"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# The splitter yields positional row indices, so select by position with .iloc\n",
	"# (replaces the deprecated .ix and the label-based y[...] lookup)\n",
	"for train_index, holdout_index in holdout_fold:\n",
	"    X_train = X.iloc[train_index]\n",
	"    X_test = X.iloc[holdout_index]\n",
	"    y_train = y.iloc[train_index]\n",
	"    y_test = y.iloc[holdout_index]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"#Define the evaluation function\n",
	"def evaluation(y_predicted, y_true):\n",
	" Ns = sum([1 if (a == 0 and b == 1) else 0 for (a,b) in zip(y_predicted,y_true)])\n",
	" Ng = sum([1 if (a == 1 and b == 0) else 0 for (a,b) in zip(y_predicted,y_true)])\n",
	" T = len(y_true)\n",
	" M = (8Ns + 2Ng)/float(T)\n",
	" return M"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"#finding the ideal learning rate and num_rounds\n",
	"params = {}\n",
	"params[\"objective\"] = \"binary:logistic\"\n",
	"params[\"max_depth\"] = 10\n",
	"params[\"eta\"] = 0.01 #step-size shrinkage in [0,1]; lower is more conservative, so if reduced then increase num_rounds\n",
	"params[\"eval_metric\"] = 'logloss' #key previously had a trailing space and so did not match the xgboost parameter name\n",
	"params[\"seed\"] = 0\n",
	"params[\"silent\"] = 1\n",
	"plst = list(params.items())\n",
	"num_rounds = 10000\n",
	"\n",
	"xgtrain = xgb.DMatrix(X_train, label=y_train) #weight= trainX_mobN_weight\n",
	"#xgb.cv(params, xgtrain, num_rounds, nfold=4, metrics={'logloss'})"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"xgb.cv gives the following output"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"[1074]\tcv-test-logloss:0.097280+0.003721\tcv-train-logloss:0.028058+0.001354 \n",
	"[1075]\tcv-test-logloss:0.097280+0.003718\tcv-train-logloss:0.028017+0.001332 \n",
	"[1076]\tcv-test-logloss:0.097291+0.003725\tcv-train-logloss:0.027987+0.001338 \n",
	"[1077]\tcv-test-logloss:0.097279+0.003724\tcv-train-logloss:0.027958+0.001339 \n",
	"[1078]\tcv-test-logloss:0.097273+0.003725\tcv-train-logloss:0.027924+0.001347 "
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"As one can see from above the overfitting starts at around tree number 1077."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#give higher weights to the labels 1 as the target variable is skewed\n",
	"# The loop variable is deliberately not called `data`: under Python 2 a list-comprehension\n",
	"# variable leaks into the enclosing scope and would overwrite the `data` DataFrame.\n",
	"Xtrain_weight = [2 if label_value == 1 else 1 for label_value in y_train]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"#Having determined the ideal learning rate and num_rounds, we tune the other parameters\n",
	"params = {}\n",
	"params[\"objective\"] = \"binary:logistic\"\n",
	"#To avoid overfitting: The first way is to directly control model complexity\n",
	"params[\"min_child_weight\"] = 3 #The larger, the more conservative the algorithm will be.\n",
	"params[\"max_depth\"] = 10\n",
	"#params[\"gamma\"] = 0 #The larger, the more conservative the algorithm will be.\n",
	"params[\"eta\"] = 0.01 #higher is more conservative [0,1], if reduced then increase num_rounds\n",
	"#The second way is to add randomness to make training robust to noise\n",
	"params[\"subsample\"] = 0.9\n",
	"params[\"colsample_bytree\"] = 0.9\n",
	"\n",
	"#Handle Imbalanced Dataset\n",
	"#If you care only about the ranking order (AUC) of your prediction\n",
	"#params[\"scale_pos_weight\"] = 1 #ratio of labels in target variable\n",
	"params[\"eval_metric \"] = 'logloss'\n",
	"#If you care about predicting the right probability\n",
	"params[\"max_delta_step\"]= 8 #should be high for skewed data\n",
	"\n",
	"params[\"seed\"] = 0\n",
	"params[\"silent\"] = 1\n",
	"params[\"nthread\"] = 4\n",
	"plst = list(params.items())\n",
	"num_rounds = 1100"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"xgtrain = xgb.DMatrix(X_train, label=y_train, weight= Xtrain_weight)\n",
	"xgtest = xgb.DMatrix(X_test)\n",
	"model = xgb.train(plst, xgtrain, num_rounds)\n",
	"pred_ytest = model.predict(xgtest)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Our evaluation function has a higher penalty for misclassifying a case as Silver (i.e. 0). So it is better for our model to predict more Gold (i.e. 1) and be wrong on those than to predict more Silver and be wrong on those. We therefore chose a low decision threshold for the prediction (by trial and error); in our case it is 0.30. "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"predictions = [1 if pred > 0.30 else 0 for pred in pred_ytest]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.12506154603643527"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"evaluation(predictions,y_test)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#Predict on the test set\n",
	"# Refit on the full training data with the same 2:1 weighting of label 1.\n",
	"# The loop variable is deliberately not called `data`: under Python 2 a list-comprehension\n",
	"# variable leaks into the enclosing scope and would overwrite the `data` DataFrame.\n",
	"X_weight = [2 if label_value == 1 else 1 for label_value in y]\n",
	"xgtrain = xgb.DMatrix(X, label=y, weight= X_weight)\n",
	"xgtest = xgb.DMatrix(Final_test)\n",
	"model_full = xgb.train(plst, xgtrain, num_rounds)\n",
	"pred_Finaltest = model_full.predict(xgtest)\n",
	"# Same 0.30 cut-off as used on the hold-out set\n",
	"predictions_final = ['Gold' if pred > 0.30 else 'Silver' for pred in pred_Finaltest]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We save our data for ensembling with the other XGB model that we built. "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"to_ensemble = pd.DataFrame({ 'REF_NO': test['REF_NO'], 'Account.Status':pred_Finaltest})\n",
	"to_ensemble = to_ensemble[['REF_NO', 'Account.Status']]\n",
	"to_ensemble.to_csv(\"data/subXGB2.csv\", index = False)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}