xgboost_working_example.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"env: NCCL_P2P_DISABLE=1\n"
]
}
],
"source": [
"%env NCCL_P2P_DISABLE=1"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import xgboost \n",
"import dask_cudf\n",
"from dask import delayed\n",
"import dask_xgboost\n",
"from dask.distributed import Client, wait\n",
"from dask.dataframe import from_delayed\n",
"import cudf\n",
"import dask\n",
"from dask_cuda import LocalCUDACluster"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_py-xgboost-mutex 2.0 cpu_0 conda-forge\n",
"dask-xgboost 0.2.0.dev28 cuda10.0py37_0 rapidsai/label/xgboost\n",
"libxgboost 0.90 he1b5a44_2 conda-forge\n",
"py-xgboost 0.90 py37he1b5a44_2 conda-forge\n",
"xgboost 1.0.0-SNAPSHOT pypi_0 pypi\n"
]
}
],
"source": [
"!conda list | grep xgboost"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### xgboost source\n",
"!xgboost is instaled from hhttps://xgboost-ci.net/job/xgboost/job/master/lastSuccessfulBuild/artifact/python-package/dist/xgboost-1.0.0_SNAPSHOT-py2.py3-none-any.whl"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"cluster = LocalCUDACluster()\n",
"client = Client(cluster)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df = dask_cudf.from_cudf(cudf.DataFrame({'x':[1,2]*16,'y':[0,1]*16}), npartitions=8)\n",
"df = df.persist()\n",
"done = wait(df)"
]
},
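{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick check (added sketch, not part of the original run), we can inspect which worker holds which persisted partition via `client.has_what()`; this per-worker key listing is exactly what the repartitioning helper later in the notebook relies on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Sketch (added): see how the persisted partitions are spread across workers.\n",
"### client.has_what() maps each worker address to the keys it holds in memory.\n",
"for worker_ip, keys in client.has_what().items():\n",
"    print(worker_ip, len(keys), 'partition(s)')"
]
},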
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"xgb_gpu_params = {\n",
" 'nround': 100,\n",
" 'max_depth': 8,\n",
" 'max_leaves': 2**8,\n",
" 'alpha': 0.9,\n",
" 'eta': 0.1,\n",
" 'gamma': 0.1,\n",
" 'learning_rate': 0.1,\n",
" 'subsample': 1,\n",
" 'reg_lambda': 1,\n",
" 'scale_pos_weight': 2,\n",
" 'min_child_weight': 30,\n",
" 'tree_method': 'gpu_hist',\n",
" 'loss': 'ls',\n",
" 'objective': 'binary:logistic',\n",
" 'max_features': 'auto',\n",
" 'criterion': 'friedman_mse',\n",
" 'grow_policy': 'lossguide',\n",
" 'verbose': True\n",
" }\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training using cudf"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xgboost.core.Booster at 0x7f22248fceb8>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = xgboost.train(xgb_gpu_params,xgboost.DMatrix(df[['x']].compute(),df[['y']].compute()))\n",
"model"
]
},
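{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal prediction sanity check (added sketch, not part of the original run): the fitted Booster scores features through a `DMatrix`, and for `binary:logistic` it returns one probability per row."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Sketch (added): score the training features with the fitted Booster.\n",
"### For 'binary:logistic', predict() returns one probability per row.\n",
"preds = model.predict(xgboost.DMatrix(df[['x']].compute()))\n",
"print(preds[:4])"
]
},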
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training using dask-cudf"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xgboost.core.Booster at 0x7f212814b4e0>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = dask_xgboost.train(client, xgb_gpu_params, df[['x']], df[['y']], \n",
" num_boost_round=xgb_gpu_params['nround'])\n",
"model"
]
},
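{
"cell_type": "markdown",
"metadata": {},
"source": [
"dask-xgboost also ships a distributed `predict` helper. A hedged sketch (added, not executed here), assuming the `dask_xgboost.predict(client, model, data)` signature from the same package:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Sketch (added): distributed prediction with the dask-xgboost helper.\n",
"### The result is a lazy dask collection; compute() materializes it.\n",
"preds = dask_xgboost.predict(client, model, df[['x']])\n",
"print(preds.compute())"
]
},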
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training using dmatrix\n",
"##### Dmatrix requires that you have 1 dmatrix per GPU/Worker\n",
"* Below i first repartion the df by concatinating frames on the same GPU\n",
"* Then create DMatrix for training"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def repartition_cudf_join_on_gpu(df,client):\n",
" \"\"\"\n",
" This function repartitions the dataframe by joining all \n",
" keys that are on the same gpu\n",
" This only works with persisted DF\n",
" \"\"\"\n",
" dataframe_dask_keys = [str(key) for key in df.__dask_keys__() ]\n",
" client_dict = client.has_what()\n",
" \n",
" ### this creates a key map\n",
" ## where key is worker_ip\n",
" ## and values are the dataframe objects at that key\n",
" worker_key_map = {}\n",
" for worker_ip,worker_tasks in client_dict.items():\n",
" worker_key_map[worker_ip] = [delayed(lambda x:x)(dask_key_name = dask_key) for dask_key in worker_tasks if dask_key in dataframe_dask_keys]\n",
" \n",
" concatenated_df_task_ls = []\n",
" for list_delayed in worker_key_map.values():\n",
" concatenated_df_task_ls.append(delayed(cudf.concat)(list_delayed))\n",
" \n",
" ### persisting the tasks on GPU\n",
" concatenated_df_task_ls = [task.persist() for task in concatenated_df_task_ls]\n",
" done = wait(concatenated_df_task_ls)\n",
"\n",
"\n",
" return concatenated_df_task_ls\n",
"\n",
"\n",
"def get_dmatrix_from_persisted_df(df,label_col = 'y',non_label_columns=['x']):\n",
" gpu_dfs = [(gpu_df[[label_col]], gpu_df[non_label_columns]) for gpu_df in df.to_delayed()]\n",
" gpu_dfs = [(gpu_df[0].persist(), gpu_df[1].persist()) for gpu_df in gpu_dfs]\n",
"\n",
" dmat_ls = [dask.delayed(xgboost.DMatrix)(gpu_df[1], gpu_df[0]) for gpu_df in gpu_dfs]\n",
" dmat_ls = [dmat.persist() for dmat in dmat_ls]\n",
" wait(dmat_ls)\n",
" \n",
" return dmat_ls"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"repartitioned_df = dask_cudf.from_delayed(repartition_cudf_join_on_gpu(df,client))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"assert repartitioned_df.npartitions == len(client.scheduler_info()['workers'])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"dmat = get_dmatrix_from_persisted_df(repartitioned_df)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xgboost.core.Booster at 0x7f20faff2fd0>"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = dask_xgboost.train(client, xgb_gpu_params, dmat, None, num_boost_round=xgb_gpu_params['nround'])\n",
"model"
]
}
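,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, an added sketch (not in the original run): the trained Booster can be saved and reloaded with the standard `save_model`/`load_model` Booster methods; `xgb_gpu.model` is just an example filename."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Sketch (added): persist the trained Booster to disk and reload it.\n",
"### 'xgb_gpu.model' is an arbitrary example filename.\n",
"model.save_model('xgb_gpu.model')\n",
"loaded_model = xgboost.Booster()\n",
"loaded_model.load_model('xgb_gpu.model')"
]
}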
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}