Created
February 17, 2024 07:41
-
-
Save masudahiroto/8bd5ac8c5c467e8e3db64105b57d0080 to your computer and use it in GitHub Desktop.
XGBoost Training Speed: A Comparative Analysis.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"gpuType": "T4", | |
"authorship_tag": "ABX9TyNnl09giS4CjklNd2tnDAgK", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/masudahiroto/8bd5ac8c5c467e8e3db64105b57d0080/xgboost-training-speed-a-comparative-analysis.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"experiment_type = '1.7.6' #@param [\"1.7.6\", \"1.7.6+hist\", \"2.0.3\", \"2.0.3+GPU\"]" | |
], | |
"metadata": { | |
"id": "RBXjDga1YcxZ" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"if experiment_type == \"1.7.6\":\n", | |
" xgboost_version = \"1.7.6\"\n", | |
" tree_method = None\n", | |
" use_gpu = False\n", | |
"elif experiment_type == \"1.7.6+hist\":\n", | |
" xgboost_version = \"1.7.6\"\n", | |
" tree_method = \"hist\"\n", | |
" use_gpu = False\n", | |
"elif experiment_type == \"2.0.3\":\n", | |
" xgboost_version = \"2.0.3\"\n", | |
" tree_method = None\n", | |
" use_gpu = False\n", | |
"elif experiment_type == \"2.0.3+GPU\":\n", | |
" xgboost_version = \"2.0.3\"\n", | |
" tree_method = None\n", | |
" use_gpu = True\n", | |
"else:\n", | |
" raise ValueError()" | |
], | |
"metadata": { | |
"id": "MXOggu8kZ0sa" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Check whether GPU is avaliable\n", | |
"if use_gpu:\n", | |
" import tensorflow as tf\n", | |
" assert tf.test.is_gpu_available(), \"\"\"\\\n", | |
"GPU is not attached. You have to do the following steps:\n", | |
"\n", | |
"1. Open your Colab notebook.\n", | |
"2. Select \"Runtime\" from the menu.\n", | |
"3. Click on \"Change runtime type.\"\n", | |
"4. Expand the \"Hardware accelerator\" dropdown.\n", | |
"5. Change from \"None\" to \"GPU.\"\n", | |
"6. Click \"Save\" to apply the changes.\n", | |
"\"\"\"" | |
], | |
"metadata": { | |
"id": "T6m-u4g4a3Zz" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Install XGBoost for a specified version.\n", | |
"!pip install xgboost=={xgboost_version}" | |
], | |
"metadata": { | |
"id": "KBRW81p_XtjA" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import xgboost as xgb\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"\n", | |
"import time" | |
], | |
"metadata": { | |
"id": "faA65kwERBh6" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"assert xgb.__version__ == xgboost_version" | |
], | |
"metadata": { | |
"id": "rKoilyliRC5H" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "QZnW_2-qO4Wx" | |
}, | |
"outputs": [], | |
"source": [ | |
"def generate_dummy_data(num_records, num_features, num_categorical):\n", | |
" np.random.seed(42)\n", | |
"\n", | |
" # Numerical features\n", | |
" X_numeric = np.random.rand(num_records, num_features - num_categorical)\n", | |
"\n", | |
" # Categorical variables\n", | |
" categories = [f'cat_{i}' for i in range(num_categorical)]\n", | |
" X_categorical = np.random.choice([1, 2, 3], size=(num_records, num_categorical))\n", | |
" X_categorical_df = pd.DataFrame(X_categorical, columns=categories)\n", | |
"\n", | |
" # Concatenate dummy data\n", | |
" X = np.hstack([X_numeric, X_categorical_df])\n", | |
"\n", | |
" # Labels\n", | |
" y = np.random.randint(2, size=num_records)\n", | |
"\n", | |
" # Feature types: 'q' for quantitative (numeric), 'c' for categorical\n", | |
" ft = ['q'] * (num_features - num_categorical) + ['c'] * num_categorical\n", | |
" return X, y, ft\n", | |
"\n", | |
"def train_xgboost(X_train, y_train, X_test, y_test, params, ft, num_round=100):\n", | |
" dtrain = xgb.DMatrix(X_train, label=y_train, feature_types=ft, enable_categorical=True)\n", | |
" dtest = xgb.DMatrix(X_test, label=y_test, feature_types=ft, enable_categorical=True)\n", | |
"\n", | |
" if use_gpu:\n", | |
" # Add parameters to use gpu\n", | |
" params[\"device\"] = \"cuda\"\n", | |
" params[\"tree_method\"] = \"hist\"\n", | |
"\n", | |
" if tree_method:\n", | |
" params[\"tree_method\"] = tree_method\n", | |
"\n", | |
" if experiment_type == \"1.7.6\":\n", | |
" num_round = 30 # because this condition is super slow.\n", | |
" start = time.perf_counter()\n", | |
" xgb.train(params, dtrain, num_round, evals=[(dtest, \"validation\")], verbose_eval=1)\n", | |
" end = time.perf_counter()\n", | |
"\n", | |
" print(f'{(end - start) / num_round:.5f} seconds per iteration')\n", | |
" return" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"num_records = 1000000\n", | |
"num_features = 200\n", | |
"num_categorical = 10\n", | |
"\n", | |
"X, y, ft = generate_dummy_data(num_records, num_features, num_categorical)\n", | |
"\n", | |
"# データを訓練用とテスト用に分割\n", | |
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" | |
], | |
"metadata": { | |
"id": "nXUYksUxP5vp" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# XGBoostのパラメータ設定\n", | |
"xgboost_params = {\n", | |
" 'objective': 'binary:logistic',\n", | |
" 'eval_metric': 'auc',\n", | |
" 'eta': 0.3,\n", | |
" 'max_depth': 6,\n", | |
" 'subsample': 1.0,\n", | |
" 'colsample_bytree': 1.0,\n", | |
" 'alpha': 1,\n", | |
" 'lambda': 1,\n", | |
" 'nthread': 4,\n", | |
" 'random_state': 42,\n", | |
" 'silent': 1\n", | |
"}\n", | |
"\n", | |
"train_xgboost(X_train, y_train, X_test, y_test, xgboost_params, ft)" | |
], | |
"metadata": { | |
"id": "XHfIbQp4QHE3" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [], | |
"metadata": { | |
"id": "dOhLePb-fCNU" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment