Skip to content

Instantly share code, notes, and snippets.

@daxiongshu
Created June 10, 2019 03:47
Show Gist options
  • Save daxiongshu/01c12cf5e502be3d54b87cda1cf2d14f to your computer and use it in GitHub Desktop.
Save daxiongshu/01c12cf5e502be3d54b87cda1cf2d14f to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(200000, 202)\n"
]
}
],
"source": [
"import cudf as gd\n",
"import pandas as pd\n",
"import time\n",
"import xgboost as xgb\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"# Directory that contains train.csv.\n",
"PATH = '../input'\n",
"\n",
"# Explicit schema for the CSV: ID_code, target, then var_0 .. var_199.\n",
"cols = ['ID_code', 'target']\n",
"cols += ['var_%d'%i for i in range(200)]\n",
"dtypes = ['int32', 'int32'] + ['float32'] * 200\n",
"\n",
"# Column names are supplied through `names`, so the first line of the file is skipped.\n",
"train_gd = gd.read_csv('%s/train.csv'%PATH,\n",
"                       names=cols,\n",
"                       dtype=dtypes,\n",
"                       skiprows=1)\n",
"print(train_gd.shape)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 39.6 s, sys: 1min 55s, total: 2min 34s\n",
"Wall time: 3min 41s\n"
]
}
],
"source": [
"%%time\n",
"# For each raw feature, attach `<feature>_count`: the number of rows that share\n",
"# the row's value of that feature.\n",
"for idx in range(200):\n",
"    feature = 'var_%d'%idx\n",
"    # Group by the feature's own values and count the rows in each group.\n",
"    counts = train_gd.groupby(feature).agg({feature:'count'})\n",
"    counts.columns = ['%s_count'%feature]\n",
"    # reset_index() turns the group key back into a regular column to merge on.\n",
"    train_gd = train_gd.merge(counts.reset_index(),on=feature,how='left')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1min 42s, sys: 3min 25s, total: 5min 7s\n",
"Wall time: 5min 41s\n"
]
}
],
"source": [
"%%time\n",
"# For each raw feature, add `<feature>_gt1`: a copy of the value that is only\n",
"# populated on rows whose `<feature>_count` is greater than 1.\n",
"for idx in range(200):\n",
"    feature = 'var_%d'%idx\n",
"    count_name = '%s_count'%feature\n",
"    gt1_name = '%s_gt1'%feature\n",
"    # Keep only the rows where this feature's count exceeds 1.\n",
"    repeated = train_gd[[feature,count_name,'ID_code']].query(\"%s_count > 1\"%feature)\n",
"    repeated.columns = [gt1_name,count_name,'ID_code']\n",
"    # Left merge on ID_code: rows absent from `repeated` get no value in the new column.\n",
"    train_gd = train_gd.merge(repeated[['ID_code',gt1_name]],on='ID_code',how='left')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Hold out the last 10000 rows of the frame for validation; train on the rest.\n",
"train = train_gd[:-10000]\n",
"valid = train_gd[-10000:]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Labels come from the `target` column of each split.\n",
"y_train, y_valid = train['target'], valid['target']\n",
"\n",
"# Features are everything that remains once `target` and `ID_code` are dropped.\n",
"x_train = train.drop(['target','ID_code'])\n",
"x_valid = valid.drop(['target','ID_code'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1min 24s, sys: 12.2 s, total: 1min 36s\n",
"Wall time: 7.17 s\n"
]
}
],
"source": [
"%%time\n",
"# Booster settings: binary logistic objective, depth-1 trees built with the\n",
"# 'gpu_hist' tree method, evaluated by AUC.\n",
"xgb_params = dict(\n",
"    objective='binary:logistic',\n",
"    tree_method='gpu_hist',\n",
"    max_depth=1,\n",
"    eta=0.1,\n",
"    silent=1,\n",
"    subsample=0.5,\n",
"    colsample_bytree=0.05,\n",
"    eval_metric='auc',\n",
")\n",
"\n",
"# Each cuDF frame/series is converted with to_pandas() before being handed to DMatrix.\n",
"dtrain = xgb.DMatrix(x_train.to_pandas(), label=y_train.to_pandas())\n",
"dvalid = xgb.DMatrix(x_valid.to_pandas(), label=y_valid.to_pandas())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\teval-auc:0.525513\ttrain-auc:0.530052\n",
"Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n",
"\n",
"Will train until train-auc hasn't improved in 30 rounds.\n",
"[1000]\teval-auc:0.891935\ttrain-auc:0.897651\n",
"[2000]\teval-auc:0.908812\ttrain-auc:0.915395\n",
"[3000]\teval-auc:0.915829\ttrain-auc:0.921903\n",
"[4000]\teval-auc:0.918794\ttrain-auc:0.925332\n",
"[5000]\teval-auc:0.91931\ttrain-auc:0.927491\n",
"[6000]\teval-auc:0.92052\ttrain-auc:0.929105\n",
"[7000]\teval-auc:0.91998\ttrain-auc:0.93053\n",
"[8000]\teval-auc:0.919491\ttrain-auc:0.931805\n",
"[9000]\teval-auc:0.919273\ttrain-auc:0.93297\n",
"[9999]\teval-auc:0.919236\ttrain-auc:0.933955\n"
]
}
],
"source": [
"# xgb.train() takes its early-stopping metric from the LAST entry in `evals`,\n",
"# so the validation set has to be listed last. With the training set last,\n",
"# train-auc keeps improving and early stopping never fires.\n",
"# NOTE: the output saved with this cell predates this ordering fix (it reports\n",
"# 'train-auc' as the early-stopping metric and runs all 10000 rounds);\n",
"# re-run the cell to refresh it.\n",
"watchlist = [(dtrain, 'train'), (dvalid, 'eval')]\n",
"\n",
"# Train for up to 10000 rounds, stopping once eval-auc has not improved for\n",
"# 30 consecutive rounds; progress is printed every 1000 rounds.\n",
"clf = xgb.train(xgb_params, dtrain=dtrain,\n",
"                num_boost_round=10000,evals=watchlist,\n",
"                early_stopping_rounds=30,maximize=True,\n",
"                verbose_eval=1000)\n",
"\n",
"# NOTE(review): depending on the XGBoost version, predict() may score with every\n",
"# trained tree rather than only those up to the best iteration - confirm for the\n",
"# installed release.\n",
"yp = clf.predict(dvalid)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Plot the trained booster's feature importances, limited to the top 10 features.\n",
"print('plot top10 important features')\n",
"xgb.plot_importance(booster=clf, max_num_features=10)\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment