
@masudahiroto
Created February 17, 2024 07:41
XGBoost Training Speed: A Comparative Analysis.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyNnl09giS4CjklNd2tnDAgK",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/masudahiroto/8bd5ac8c5c467e8e3db64105b57d0080/xgboost-training-speed-a-comparative-analysis.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"experiment_type = '1.7.6' #@param [\"1.7.6\", \"1.7.6+hist\", \"2.0.3\", \"2.0.3+GPU\"]"
],
"metadata": {
"id": "RBXjDga1YcxZ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"if experiment_type == \"1.7.6\":\n",
"    xgboost_version = \"1.7.6\"\n",
"    tree_method = None\n",
"    use_gpu = False\n",
"elif experiment_type == \"1.7.6+hist\":\n",
"    xgboost_version = \"1.7.6\"\n",
"    tree_method = \"hist\"\n",
"    use_gpu = False\n",
"elif experiment_type == \"2.0.3\":\n",
"    xgboost_version = \"2.0.3\"\n",
"    tree_method = None\n",
"    use_gpu = False\n",
"elif experiment_type == \"2.0.3+GPU\":\n",
"    xgboost_version = \"2.0.3\"\n",
"    tree_method = None\n",
"    use_gpu = True\n",
"else:\n",
"    raise ValueError(f'unknown experiment_type: {experiment_type}')"
],
"metadata": {
"id": "MXOggu8kZ0sa"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Check whether a GPU is available\n",
"if use_gpu:\n",
"    import tensorflow as tf\n",
"    assert tf.config.list_physical_devices('GPU'), \"\"\"\\\n",
"GPU is not attached. You have to do the following steps:\n",
"\n",
"1. Open your Colab notebook.\n",
"2. Select \"Runtime\" from the menu.\n",
"3. Click on \"Change runtime type.\"\n",
"4. Expand the \"Hardware accelerator\" dropdown.\n",
"5. Change from \"None\" to \"GPU.\"\n",
"6. Click \"Save\" to apply the changes.\n",
"\"\"\""
],
"metadata": {
"id": "T6m-u4g4a3Zz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Install XGBoost for a specified version.\n",
"!pip install xgboost=={xgboost_version}"
],
"metadata": {
"id": "KBRW81p_XtjA"
},
"execution_count": null,
"outputs": []
},
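{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: if a different XGBoost version was already imported in this session, restart the Colab runtime after the install above so that the version assertion below sees the newly installed version."
]
},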
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import xgboost as xgb\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import time"
],
"metadata": {
"id": "faA65kwERBh6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"assert xgb.__version__ == xgboost_version"
],
"metadata": {
"id": "rKoilyliRC5H"
},
"execution_count": null,
"outputs": []
},
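{
"cell_type": "code",
"source": [
"# Optional sanity check: print the configuration in effect for this run so the\n",
"# timing output below can be matched to a setting.\n",
"print(f'experiment_type={experiment_type}, xgboost={xgb.__version__}, '\n",
"      f'tree_method={tree_method}, use_gpu={use_gpu}')"
],
"metadata": {},
"execution_count": null,
"outputs": []
},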
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QZnW_2-qO4Wx"
},
"outputs": [],
"source": [
"def generate_dummy_data(num_records, num_features, num_categorical):\n",
"    np.random.seed(42)\n",
"\n",
"    # Numerical features\n",
"    X_numeric = np.random.rand(num_records, num_features - num_categorical)\n",
"\n",
"    # Categorical features, encoded as small integers\n",
"    categories = [f'cat_{i}' for i in range(num_categorical)]\n",
"    X_categorical = np.random.choice([1, 2, 3], size=(num_records, num_categorical))\n",
"    X_categorical_df = pd.DataFrame(X_categorical, columns=categories)\n",
"\n",
"    # Concatenate numerical and categorical dummy data\n",
"    X = np.hstack([X_numeric, X_categorical_df])\n",
"\n",
"    # Binary labels\n",
"    y = np.random.randint(2, size=num_records)\n",
"\n",
"    # Feature types: 'q' for quantitative (numeric), 'c' for categorical\n",
"    ft = ['q'] * (num_features - num_categorical) + ['c'] * num_categorical\n",
"    return X, y, ft\n",
"\n",
"\n",
"def train_xgboost(X_train, y_train, X_test, y_test, params, ft, num_round=100):\n",
"    dtrain = xgb.DMatrix(X_train, label=y_train, feature_types=ft, enable_categorical=True)\n",
"    dtest = xgb.DMatrix(X_test, label=y_test, feature_types=ft, enable_categorical=True)\n",
"\n",
"    if use_gpu:\n",
"        # Add parameters to train on the GPU\n",
"        params[\"device\"] = \"cuda\"\n",
"        params[\"tree_method\"] = \"hist\"\n",
"\n",
"    if tree_method:\n",
"        params[\"tree_method\"] = tree_method\n",
"\n",
"    if experiment_type == \"1.7.6\":\n",
"        num_round = 30  # this configuration is very slow, so use fewer rounds\n",
"\n",
"    start = time.perf_counter()\n",
"    xgb.train(params, dtrain, num_round, evals=[(dtest, \"validation\")], verbose_eval=1)\n",
"    end = time.perf_counter()\n",
"\n",
"    print(f'{(end - start) / num_round:.5f} seconds per iteration')\n",
"    return"
]
},
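{
"cell_type": "code",
"source": [
"# Optional smoke test: exercise generate_dummy_data and train_xgboost on a tiny\n",
"# dataset first, so any setup problem (package version, GPU runtime) surfaces before\n",
"# the full-size benchmark below. The sizes and the minimal parameter dict here are\n",
"# arbitrary and only meant to confirm that training runs end to end.\n",
"X_small, y_small, ft_small = generate_dummy_data(1000, 20, 2)\n",
"X_tr, X_te, y_tr, y_te = train_test_split(X_small, y_small, test_size=0.2, random_state=42)\n",
"train_xgboost(X_tr, y_tr, X_te, y_te,\n",
"              {'objective': 'binary:logistic', 'eval_metric': 'auc'},\n",
"              ft_small, num_round=5)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},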
{
"cell_type": "code",
"source": [
"num_records = 1000000\n",
"num_features = 200\n",
"num_categorical = 10\n",
"\n",
"X, y, ft = generate_dummy_data(num_records, num_features, num_categorical)\n",
"\n",
"# Split the data into training and test sets\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
],
"metadata": {
"id": "nXUYksUxP5vp"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# XGBoost parameter settings\n",
"xgboost_params = {\n",
"    'objective': 'binary:logistic',\n",
"    'eval_metric': 'auc',\n",
"    'eta': 0.3,\n",
"    'max_depth': 6,\n",
"    'subsample': 1.0,\n",
"    'colsample_bytree': 1.0,\n",
"    'alpha': 1,\n",
"    'lambda': 1,\n",
"    'nthread': 4,\n",
"    'seed': 42,  # the native train() API expects 'seed' rather than 'random_state'\n",
"    'verbosity': 0  # 'silent' was removed in favor of 'verbosity'\n",
"}\n",
"\n",
"train_xgboost(X_train, y_train, X_test, y_test, xgboost_params, ft)"
],
"metadata": {
"id": "XHfIbQp4QHE3"
},
"execution_count": null,
"outputs": []
},
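{
"cell_type": "code",
"source": [
"# Optional: after rerunning the notebook once per experiment_type, the reported\n",
"# seconds-per-iteration figures can be collected here for a side-by-side comparison.\n",
"# The None values are placeholders to be filled in by hand, not measured results.\n",
"seconds_per_iteration = {\n",
"    '1.7.6': None,\n",
"    '1.7.6+hist': None,\n",
"    '2.0.3': None,\n",
"    '2.0.3+GPU': None,\n",
"}\n",
"seconds_per_iteration"
],
"metadata": {},
"execution_count": null,
"outputs": []
},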
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "dOhLePb-fCNU"
},
"execution_count": null,
"outputs": []
}
]
}