Skip to content

Instantly share code, notes, and snippets.

@daxiongshu
Last active July 13, 2019 01:06
Show Gist options
  • Save daxiongshu/335b4a729b9006d4351346462f315c4a to your computer and use it in GitHub Desktop.
Save daxiongshu/335b4a729b9006d4351346462f315c4a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(200000, 202)\n"
]
}
],
"source": [
"import cudf as gd\n",
"import pandas as pd\n",
"import time\n",
"import xgboost as xgb\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"# Directory holding the input CSV files, relative to this notebook.\n",
"PATH = '../input'\n",
"\n",
"# Explicit schema handed to the reader: two leading columns, then var_0 .. var_199.\n",
"cols = ['ID_code', 'target']\n",
"cols.extend('var_%d'%idx for idx in range(200))\n",
"# NOTE(review): ID_code is requested as int32 -- confirm the IDs in the file parse as integers.\n",
"dtypes = ['int32', 'int32'] + ['float32'] * 200\n",
"\n",
"# skiprows=1 drops the file's first row (presumably its header) because names are supplied here.\n",
"train_gd = gd.read_csv(\n",
"    '%s/train.csv'%PATH,\n",
"    names=cols,\n",
"    dtype=dtypes,\n",
"    skiprows=1,\n",
")\n",
"print(train_gd.shape)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 39 s, sys: 1min 54s, total: 2min 33s\n",
"Wall time: 3min 40s\n"
]
}
],
"source": [
"%%time\n",
"# Count encoding: for every var_i, add a var_i_count column holding the number of\n",
"# rows in train_gd that share the same var_i value.\n",
"for i in range(200):\n",
"    col = 'var_%d'%i\n",
"    count_col = '%s_count'%col\n",
"    # This cell rebinds train_gd, so guard against re-runs: skip features that were\n",
"    # already encoded instead of merging a second copy of the count column.\n",
"    if count_col in train_gd.columns:\n",
"        continue\n",
"    # Per-value row counts, indexed by the distinct values of col.\n",
"    var_count = train_gd.groupby(col).agg({col:'count'})\n",
"    var_count.columns = [count_col]\n",
"    var_count = var_count.reset_index()\n",
"    # Left merge keeps every row of train_gd and attaches its value's count.\n",
"    train_gd = train_gd.merge(var_count,on=col,how='left')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\teval-auc:0.528313\ttrain-auc:0.527316\n",
"Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.\n",
"\n",
"Will train until train-auc hasn't improved in 30 rounds.\n",
"[1000]\teval-auc:0.887032\ttrain-auc:0.885971\n",
"[2000]\teval-auc:0.901109\ttrain-auc:0.902143\n",
"[3000]\teval-auc:0.903539\ttrain-auc:0.907638\n",
"[4000]\teval-auc:0.904385\ttrain-auc:0.910537\n",
"[5000]\teval-auc:0.904599\ttrain-auc:0.912562\n",
"[6000]\teval-auc:0.904142\ttrain-auc:0.914191\n",
"[7000]\teval-auc:0.904253\ttrain-auc:0.91562\n",
"[8000]\teval-auc:0.9038\ttrain-auc:0.916824\n",
"[9000]\teval-auc:0.903308\ttrain-auc:0.917899\n",
"[9999]\teval-auc:0.903325\ttrain-auc:0.918851\n",
"CPU times: user 3h 6min 8s, sys: 10min 35s, total: 3h 16min 44s\n",
"Wall time: 2min 40s\n"
]
}
],
"source": [
"%%time\n",
"# Positional hold-out: the last 10000 rows of train_gd validate, the rest train.\n",
"# NOTE(review): train_gd has been through 200 cuDF merges -- confirm the row order\n",
"# they leave behind is the one this positional split is meant to use.\n",
"train,valid = train_gd[:-10000],train_gd[-10000:]\n",
"x_train = train.drop(['target','ID_code'])\n",
"y_train = train['target']\n",
"x_valid = valid.drop(['target','ID_code'])\n",
"y_valid = valid['target']\n",
"\n",
"# Depth-1 trees, each fit on half of the rows and 5% of the columns.\n",
"xgb_params = {\n",
"    'objective': 'binary:logistic',\n",
"    'tree_method': 'gpu_hist',\n",
"    'max_depth': 1,\n",
"    'eta':0.1,\n",
"    'silent':1,\n",
"    'subsample':0.5,\n",
"    'colsample_bytree': 0.05,\n",
"    'eval_metric':'auc',\n",
"}\n",
"# DMatrix is built from pandas objects, so the cuDF frames are copied to the host first.\n",
"dtrain = xgb.DMatrix(data=x_train.to_pandas(), label=y_train.to_pandas())\n",
"dvalid = xgb.DMatrix(data=x_valid.to_pandas(), label=y_valid.to_pandas())\n",
"\n",
"# xgb.train monitors the LAST entry of evals for early stopping, so the hold-out\n",
"# set must come last. With the training set last (as in the saved output of this\n",
"# cell) train-auc is monitored, keeps improving, and all 10000 rounds run while\n",
"# eval-auc has already started to decline.\n",
"watchlist = [(dtrain, 'train'), (dvalid, 'eval')]\n",
"# NOTE(review): 30 rounds of patience on a subsampled, depth-1 model may stop on\n",
"# noise in eval-auc -- consider a larger early_stopping_rounds if it stops too soon.\n",
"clf = xgb.train(xgb_params, dtrain=dtrain,\n",
"                num_boost_round=10000,evals=watchlist,\n",
"                early_stopping_rounds=30,maximize=True,\n",
"                verbose_eval=1000)\n",
"# NOTE(review): depending on the xgboost version, predict may use every trained\n",
"# round rather than only those up to clf.best_iteration -- confirm if the exact\n",
"# best-round predictions are required.\n",
"yp = clf.predict(dvalid)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment