Skip to content

Instantly share code, notes, and snippets.

@pronojitsaha
Created October 18, 2015 10:26
Show Gist options
  • Save pronojitsaha/fbb73f94ca7a4223c9e0 to your computer and use it in GitHub Desktop.
Save pronojitsaha/fbb73f94ca7a4223c9e0 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# suppress pandas warnings\n",
"import warnings\n",
"warnings.simplefilter(action = \"ignore\", category = RuntimeWarning)\n",
"warnings.simplefilter(action = \"ignore\", category = FutureWarning)\n",
"\n",
"# imports\n",
"import sys\n",
"import xgboost as xgb\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn import preprocessing\n",
"import pandas as pd\n",
"from numpy.random import seed\n",
"from sklearn.cross_validation import StratifiedShuffleSplit\n",
"\n",
"# reproduce results\n",
"seed(786)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"((8123, 32), (2032, 31))"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Read the files\n",
"train = pd.read_csv('data/train_K9K1f9B.csv')\n",
"test = pd.read_csv('data/test_yIjzS7t.csv')\n",
"train.shape, test.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"unknown_cols = ['status', 'occupation', 'occupation_partner', 'region']\n",
"for col in unknown_cols:\n",
" train.ix[train[col] == 'Unknown',col] = float('nan')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I treated the unknown values in different features as missing value as well. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#For test set\n",
"for col in unknown_cols:\n",
" test.ix[test[col] == 'Unknown',col] = float('nan')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Missing values was not given any special treatment as XGB has its own way of treating them, so we leave it to the algorithm to decide the best. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The feature 'post_area' had lot of levels, so treated the less frequent areas as one (i.e. others). "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pa_counts = train.post_area.value_counts(dropna=True)\n",
"pa_counts_rare = list(pa_counts[pa_counts<5].index)\n",
"train.ix[train['post_area'].isin(pa_counts_rare), \"post_area\"] = \"Others\"\n",
"test.ix[test['post_area'].isin(pa_counts_rare), \"post_area\"] = \"Others\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For test set, we take the new values of 'post_area' which are not in train set as 'Others' too. "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"newpostareas = list(set(test['post_area']) - set(train['post_area']))\n",
"test.ix[test['post_area'].isin(newpostareas), \"post_area\"] = \"Others\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data = train.copy()\n",
"label = data['Account.Status'].map({'Silver': 0, 'Gold': 1})\n",
"\n",
"#I dropped the feature 'post_code' since its highest frequency was 2 and hence would not add any value to the model.\n",
"dropCols = ['REF_NO', 'Account.Status', 'post_code']\n",
"data.drop(dropCols, axis=1, inplace = True)\n",
"\n",
"y = label\n",
"X = pd.get_dummies(data) #converted the categorical features into 2 level factor variables. "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#For test\n",
"test2 = test.copy()\n",
"testdropcols = list(set(dropCols)-set(['Account.Status']))\n",
"test2 = test.drop(testdropcols, axis=1)\n",
"\n",
"Final_test = pd.get_dummies(test2)\n",
"missingCols = list(set(X.columns)-set(Final_test.columns))\n",
"for col in missingCols:\n",
" Final_test[col] = 0\n",
"Final_test = Final_test[X.columns]\n",
"assert X.columns.equals(Final_test.columns)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 7255\n",
"1 868\n",
"dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"label.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"StratifiedShuffleSplit(labels=[0 0 0 ..., 0 0 0], n_iter=1, test_size=0.25, random_state=0)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Creating a hold out set using stratified sampling as our target variable is skewed\n",
"holdout_fold = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0)\n",
"holdout_fold"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for train_index, holdout_index in holdout_fold:\n",
" X_train = X.ix[train_index]\n",
" X_test = X.ix[holdout_index]\n",
" y_train = y[train_index]\n",
" y_test = y[holdout_index]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Define the evaluation function\n",
"def evaluation(y_predicted, y_true):\n",
" Ns = sum([1 if (a == 0 and b == 1) else 0 for (a,b) in zip(y_predicted,y_true)])\n",
" Ng = sum([1 if (a == 1 and b == 0) else 0 for (a,b) in zip(y_predicted,y_true)])\n",
" T = len(y_true)\n",
" M = (8*Ns + 2*Ng)/float(T)\n",
" return M"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#finding the ideal learning rate and num_rounds\n",
"params = {}\n",
"params[\"objective\"] = \"binary:logistic\"\n",
"params[\"max_depth\"] = 10\n",
"params[\"eta\"] = 0.01 #higher is more conservative [0,1], if reduced then increase num_rounds\n",
"params[\"eval_metric \"] = 'logloss'\n",
"params[\"seed\"] = 0\n",
"params[\"silent\"] = 1\n",
"plst = list(params.items())\n",
"num_rounds = 10000\n",
"\n",
"xgtrain = xgb.DMatrix(X_train, label=y_train) #weight= trainX_mobN_weight\n",
"#xgb.cv(params, xgtrain, num_rounds, nfold=4, metrics={'logloss'})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"xgb.cv gives the following output"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[1074]\tcv-test-logloss:0.097280+0.003721\tcv-train-logloss:0.028058+0.001354 \n",
"[1075]\tcv-test-logloss:0.097280+0.003718\tcv-train-logloss:0.028017+0.001332 \n",
"[1076]\tcv-test-logloss:0.097291+0.003725\tcv-train-logloss:0.027987+0.001338 \n",
"[1077]\tcv-test-logloss:0.097279+0.003724\tcv-train-logloss:0.027958+0.001339 \n",
"[1078]\tcv-test-logloss:0.097273+0.003725\tcv-train-logloss:0.027924+0.001347 "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As one can see from above the overfitting starts at around tree number 1077."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#give higher weights to the labels 1 as the target variable is skewed\n",
"Xtrain_weight = [2 if data == 1 else 1 for data in y_train]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Having determined the ideal learning rate and num_rounds, we tune the other parameters\n",
"params = {}\n",
"params[\"objective\"] = \"binary:logistic\"\n",
"#To avoid overfitting: The first way is to directly control model complexity\n",
"params[\"min_child_weight\"] = 3 #The larger, the more conservative the algorithm will be.\n",
"params[\"max_depth\"] = 10\n",
"#params[\"gamma\"] = 0 #The larger, the more conservative the algorithm will be.\n",
"params[\"eta\"] = 0.01 #higher is more conservative [0,1], if reduced then increase num_rounds\n",
"#The second way is to add randomness to make training robust to noise\n",
"params[\"subsample\"] = 0.9\n",
"params[\"colsample_bytree\"] = 0.9\n",
"\n",
"#Handle Imbalanced Dataset\n",
"#If you care only about the ranking order (AUC) of your prediction\n",
"#params[\"scale_pos_weight\"] = 1 #ratio of labels in target variable\n",
"params[\"eval_metric \"] = 'logloss'\n",
"#If you care about predicting the right probability\n",
"params[\"max_delta_step\"]= 8 #should be high for skewed data\n",
"\n",
"params[\"seed\"] = 0\n",
"params[\"silent\"] = 1\n",
"params[\"nthread\"] = 4\n",
"plst = list(params.items())\n",
"num_rounds = 1100"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"xgtrain = xgb.DMatrix(X_train, label=y_train, weight= Xtrain_weight)\n",
"xgtest = xgb.DMatrix(X_test)\n",
"model = xgb.train(plst, xgtrain, num_rounds)\n",
"pred_ytest = model.predict(xgtest)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In our evaluation function we have a hgher penalty for misclassification as silver i.e. 0. So it will be better for our model to predict more number of gold i.e. 1 and go wrong on them than predict more number of silver and go wrong on them. So we chose a low sensitivity of our model for the prediction (by trial and error), in our case it is 0.30. "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"predictions = [1 if pred > 0.30 else 0 for pred in pred_ytest]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.12506154603643527"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluation(predictions,y_test)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Predict on the test set\n",
"X_weight = [2 if data == 1 else 1 for data in y]\n",
"xgtrain = xgb.DMatrix(X, label=y, weight= X_weight)\n",
"xgtest = xgb.DMatrix(Final_test)\n",
"model_full = xgb.train(plst, xgtrain, num_rounds)\n",
"pred_Finaltest = model_full.predict(xgtest)\n",
"predictions_final = ['Gold' if pred > 0.30 else 'Silver' for pred in pred_Finaltest]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We save our data for ensembling with the other XGB model that we built. "
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"to_ensemble = pd.DataFrame({ 'REF_NO': test['REF_NO'], 'Account.Status':pred_Finaltest})\n",
"to_ensemble = to_ensemble[['REF_NO', 'Account.Status']]\n",
"to_ensemble.to_csv(\"data/subXGB2.csv\", index = False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment