
@masudahiroto
Created February 17, 2024 07:41
XGBoost Training Speed: A Comparative Analysis.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyNnl09giS4CjklNd2tnDAgK",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/masudahiroto/8bd5ac8c5c467e8e3db64105b57d0080/xgboost-training-speed-a-comparative-analysis.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"experiment_type = '1.7.6' #@param [\"1.7.6\", \"1.7.6+hist\", \"2.0.3\", \"2.0.3+GPU\"]"
],
"metadata": {
"id": "RBXjDga1YcxZ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"if experiment_type == \"1.7.6\":\n",
"    xgboost_version = \"1.7.6\"\n",
"    tree_method = None\n",
"    use_gpu = False\n",
"elif experiment_type == \"1.7.6+hist\":\n",
"    xgboost_version = \"1.7.6\"\n",
"    tree_method = \"hist\"\n",
"    use_gpu = False\n",
"elif experiment_type == \"2.0.3\":\n",
"    xgboost_version = \"2.0.3\"\n",
"    tree_method = None\n",
"    use_gpu = False\n",
"elif experiment_type == \"2.0.3+GPU\":\n",
"    xgboost_version = \"2.0.3\"\n",
"    tree_method = None\n",
"    use_gpu = True\n",
"else:\n",
"    raise ValueError(f'unknown experiment_type: {experiment_type}')"
],
"metadata": {
"id": "MXOggu8kZ0sa"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Check whether a GPU is available\n",
"if use_gpu:\n",
"    import tensorflow as tf\n",
"    assert tf.config.list_physical_devices('GPU'), \"\"\"\\\n",
"GPU is not attached. You have to do the following steps:\n",
"\n",
"1. Open your Colab notebook.\n",
"2. Select \"Runtime\" from the menu.\n",
"3. Click on \"Change runtime type.\"\n",
"4. Expand the \"Hardware accelerator\" dropdown.\n",
"5. Change from \"None\" to \"GPU.\"\n",
"6. Click \"Save\" to apply the changes.\n",
"\"\"\""
],
"metadata": {
"id": "T6m-u4g4a3Zz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Install XGBoost for a specified version.\n",
"!pip install xgboost=={xgboost_version}"
],
"metadata": {
"id": "KBRW81p_XtjA"
},
"execution_count": null,
"outputs": []
},
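{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: if a different XGBoost version was already imported in this session, restart the Colab runtime after the install above so that the version assertion below sees the newly installed version."
]
},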
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import xgboost as xgb\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import time"
],
"metadata": {
"id": "faA65kwERBh6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"assert xgb.__version__ == xgboost_version"
],
"metadata": {
"id": "rKoilyliRC5H"
},
"execution_count": null,
"outputs": []
},
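{
"cell_type": "code",
"source": [
"# Optional sanity check: print the configuration in effect for this run so the\n",
"# timing output below can be matched to a setting.\n",
"print(f'experiment_type={experiment_type}, xgboost={xgb.__version__}, '\n",
"      f'tree_method={tree_method}, use_gpu={use_gpu}')"
],
"metadata": {},
"execution_count": null,
"outputs": []
},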
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QZnW_2-qO4Wx"
},
"outputs": [],
"source": [
"def generate_dummy_data(num_records, num_features, num_categorical):\n",
"    np.random.seed(42)\n",
"\n",
"    # Numerical features\n",
"    X_numeric = np.random.rand(num_records, num_features - num_categorical)\n",
"\n",
"    # Categorical features, encoded as small integers\n",
"    categories = [f'cat_{i}' for i in range(num_categorical)]\n",
"    X_categorical = np.random.choice([1, 2, 3], size=(num_records, num_categorical))\n",
"    X_categorical_df = pd.DataFrame(X_categorical, columns=categories)\n",
"\n",
"    # Concatenate numerical and categorical dummy data\n",
"    X = np.hstack([X_numeric, X_categorical_df])\n",
"\n",
"    # Binary labels\n",
"    y = np.random.randint(2, size=num_records)\n",
"\n",
"    # Feature types: 'q' for quantitative (numeric), 'c' for categorical\n",
"    ft = ['q'] * (num_features - num_categorical) + ['c'] * num_categorical\n",
"    return X, y, ft\n",
"\n",
"\n",
"def train_xgboost(X_train, y_train, X_test, y_test, params, ft, num_round=100):\n",
"    dtrain = xgb.DMatrix(X_train, label=y_train, feature_types=ft, enable_categorical=True)\n",
"    dtest = xgb.DMatrix(X_test, label=y_test, feature_types=ft, enable_categorical=True)\n",
"\n",
"    if use_gpu:\n",
"        # Add parameters to train on the GPU\n",
"        params[\"device\"] = \"cuda\"\n",
"        params[\"tree_method\"] = \"hist\"\n",
"\n",
"    if tree_method:\n",
"        params[\"tree_method\"] = tree_method\n",
"\n",
"    if experiment_type == \"1.7.6\":\n",
"        num_round = 30  # this configuration is very slow, so use fewer rounds\n",
"\n",
"    start = time.perf_counter()\n",
"    xgb.train(params, dtrain, num_round, evals=[(dtest, \"validation\")], verbose_eval=1)\n",
"    end = time.perf_counter()\n",
"\n",
"    print(f'{(end - start) / num_round:.5f} seconds per iteration')\n",
"    return"
]
},
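{
"cell_type": "code",
"source": [
"# Optional smoke test: exercise generate_dummy_data and train_xgboost on a tiny\n",
"# dataset first, so any setup problem (package version, GPU runtime) surfaces before\n",
"# the full-size benchmark below. The sizes and the minimal parameter dict here are\n",
"# arbitrary and only meant to confirm that training runs end to end.\n",
"X_small, y_small, ft_small = generate_dummy_data(1000, 20, 2)\n",
"X_tr, X_te, y_tr, y_te = train_test_split(X_small, y_small, test_size=0.2, random_state=42)\n",
"train_xgboost(X_tr, y_tr, X_te, y_te,\n",
"              {'objective': 'binary:logistic', 'eval_metric': 'auc'},\n",
"              ft_small, num_round=5)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},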
{
"cell_type": "code",
"source": [
"num_records = 1000000\n",
"num_features = 200\n",
"num_categorical = 10\n",
"\n",
"X, y, ft = generate_dummy_data(num_records, num_features, num_categorical)\n",
"\n",
"# Split the data into training and test sets\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
],
"metadata": {
"id": "nXUYksUxP5vp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# XGBoost parameter settings\n",
"xgboost_params = {\n",
"    'objective': 'binary:logistic',\n",
"    'eval_metric': 'auc',\n",
"    'eta': 0.3,\n",
"    'max_depth': 6,\n",
"    'subsample': 1.0,\n",
"    'colsample_bytree': 1.0,\n",
"    'alpha': 1,\n",
"    'lambda': 1,\n",
"    'nthread': 4,\n",
"    'seed': 42,  # the native train() API expects 'seed' rather than 'random_state'\n",
"    'verbosity': 0  # 'silent' was removed in favor of 'verbosity'\n",
"}\n",
"\n",
"train_xgboost(X_train, y_train, X_test, y_test, xgboost_params, ft)"
],
"metadata": {
"id": "XHfIbQp4QHE3"
},
"execution_count": null,
"outputs": []
},
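{
"cell_type": "code",
"source": [
"# Optional: after rerunning the notebook once per experiment_type, the reported\n",
"# seconds-per-iteration figures can be collected here for a side-by-side comparison.\n",
"# The None values are placeholders to be filled in by hand, not measured results.\n",
"seconds_per_iteration = {\n",
"    '1.7.6': None,\n",
"    '1.7.6+hist': None,\n",
"    '2.0.3': None,\n",
"    '2.0.3+GPU': None,\n",
"}\n",
"seconds_per_iteration"
],
"metadata": {},
"execution_count": null,
"outputs": []
},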
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "dOhLePb-fCNU"
},
"execution_count": null,
"outputs": []
}
]
}