xgboost_working_example.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"env: NCCL_P2P_DISABLE=1\n"
]
}
],
"source": [
"%env NCCL_P2P_DISABLE=1"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import xgboost \n",
"import dask_cudf\n",
"from dask import delayed\n",
"import dask_xgboost\n",
"from dask.distributed import Client, wait\n",
"from dask.dataframe import from_delayed\n",
"import cudf\n",
"import dask\n",
"from dask_cuda import LocalCUDACluster"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_py-xgboost-mutex 2.0 cpu_0 conda-forge\n",
"dask-xgboost 0.2.0.dev28 cuda10.0py37_0 rapidsai/label/xgboost\n",
"libxgboost 0.90 he1b5a44_2 conda-forge\n",
"py-xgboost 0.90 py37he1b5a44_2 conda-forge\n",
"xgboost 1.0.0-SNAPSHOT pypi_0 pypi\n"
]
}
],
"source": [
"!conda list | grep xgboost"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### xgboost source\n",
"!xgboost is instaled from hhttps://xgboost-ci.net/job/xgboost/job/master/lastSuccessfulBuild/artifact/python-package/dist/xgboost-1.0.0_SNAPSHOT-py2.py3-none-any.whl"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"cluster = LocalCUDACluster()\n",
"client = Client(cluster)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"df = dask_cudf.from_cudf(cudf.DataFrame({'x':[1,2]*16,'y':[0,1]*16}), npartitions=8)\n",
"df = df.persist()\n",
"done = wait(df)"
]
},
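{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick check (added sketch, not part of the original run), we can inspect which worker holds which persisted partition via `client.has_what()`; this per-worker key listing is exactly what the repartitioning helper later in the notebook relies on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Sketch (added): see how the persisted partitions are spread across workers.\n",
"### client.has_what() maps each worker address to the keys it holds in memory.\n",
"for worker_ip, keys in client.has_what().items():\n",
"    print(worker_ip, len(keys), 'partition(s)')"
]
},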
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"xgb_gpu_params = {\n",
" 'nround': 100,\n",
" 'max_depth': 8,\n",
" 'max_leaves': 2**8,\n",
" 'alpha': 0.9,\n",
" 'eta': 0.1,\n",
" 'gamma': 0.1,\n",
" 'learning_rate': 0.1,\n",
" 'subsample': 1,\n",
" 'reg_lambda': 1,\n",
" 'scale_pos_weight': 2,\n",
" 'min_child_weight': 30,\n",
" 'tree_method': 'gpu_hist',\n",
" 'loss': 'ls',\n",
" 'objective': 'binary:logistic',\n",
" 'max_features': 'auto',\n",
" 'criterion': 'friedman_mse',\n",
" 'grow_policy': 'lossguide',\n",
" 'verbose': True\n",
" }\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training using cudf"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xgboost.core.Booster at 0x7f22248fceb8>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = xgboost.train(xgb_gpu_params,xgboost.DMatrix(df[['x']].compute(),df[['y']].compute()))\n",
"model"
]
},
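{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal prediction sanity check (added sketch, not part of the original run): the fitted Booster scores features through a `DMatrix`, and for `binary:logistic` it returns one probability per row."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Sketch (added): score the training features with the fitted Booster.\n",
"### For 'binary:logistic', predict() returns one probability per row.\n",
"preds = model.predict(xgboost.DMatrix(df[['x']].compute()))\n",
"print(preds[:4])"
]
},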
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training using dask-cudf"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xgboost.core.Booster at 0x7f212814b4e0>"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = dask_xgboost.train(client, xgb_gpu_params, df[['x']], df[['y']], \n",
" num_boost_round=xgb_gpu_params['nround'])\n",
"model"
]
},
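{
"cell_type": "markdown",
"metadata": {},
"source": [
"dask-xgboost also ships a distributed `predict` helper. A hedged sketch (added, not executed here), assuming the `dask_xgboost.predict(client, model, data)` signature from the same package:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Sketch (added): distributed prediction with the dask-xgboost helper.\n",
"### The result is a lazy dask collection; compute() materializes it.\n",
"preds = dask_xgboost.predict(client, model, df[['x']])\n",
"print(preds.compute())"
]
},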
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training using dmatrix\n",
"##### Dmatrix requires that you have 1 dmatrix per GPU/Worker\n",
"* Below i first repartion the df by concatinating frames on the same GPU\n",
"* Then create DMatrix for training"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def repartition_cudf_join_on_gpu(df,client):\n",
" \"\"\"\n",
" This function repartitions the dataframe by joining all \n",
" keys that are on the same gpu\n",
" This only works with persisted DF\n",
" \"\"\"\n",
" dataframe_dask_keys = [str(key) for key in df.__dask_keys__() ]\n",
" client_dict = client.has_what()\n",
" \n",
" ### this creates a key map\n",
" ## where key is worker_ip\n",
" ## and values are the dataframe objects at that key\n",
" worker_key_map = {}\n",
" for worker_ip,worker_tasks in client_dict.items():\n",
" worker_key_map[worker_ip] = [delayed(lambda x:x)(dask_key_name = dask_key) for dask_key in worker_tasks if dask_key in dataframe_dask_keys]\n",
" \n",
" concatenated_df_task_ls = []\n",
" for list_delayed in worker_key_map.values():\n",
" concatenated_df_task_ls.append(delayed(cudf.concat)(list_delayed))\n",
" \n",
" ### persisting the tasks on GPU\n",
" concatenated_df_task_ls = [task.persist() for task in concatenated_df_task_ls]\n",
" done = wait(concatenated_df_task_ls)\n",
"\n",
"\n",
" return concatenated_df_task_ls\n",
"\n",
"\n",
"def get_dmatrix_from_persisted_df(df,label_col = 'y',non_label_columns=['x']):\n",
" gpu_dfs = [(gpu_df[[label_col]], gpu_df[non_label_columns]) for gpu_df in df.to_delayed()]\n",
" gpu_dfs = [(gpu_df[0].persist(), gpu_df[1].persist()) for gpu_df in gpu_dfs]\n",
"\n",
" dmat_ls = [dask.delayed(xgboost.DMatrix)(gpu_df[1], gpu_df[0]) for gpu_df in gpu_dfs]\n",
" dmat_ls = [dmat.persist() for dmat in dmat_ls]\n",
" wait(dmat_ls)\n",
" \n",
" return dmat_ls"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"repartitioned_df = dask_cudf.from_delayed(repartition_cudf_join_on_gpu(df,client))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"assert repartitioned_df.npartitions == len(client.scheduler_info()['workers'])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"dmat = get_dmatrix_from_persisted_df(repartitioned_df)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xgboost.core.Booster at 0x7f20faff2fd0>"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = dask_xgboost.train(client, xgb_gpu_params, dmat, None, num_boost_round=xgb_gpu_params['nround'])\n",
"model"
]
}
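,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, an added sketch (not in the original run): the trained Booster can be saved and reloaded with the standard `save_model`/`load_model` Booster methods; `xgb_gpu.model` is just an example filename."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"### Sketch (added): persist the trained Booster to disk and reload it.\n",
"### 'xgb_gpu.model' is an arbitrary example filename.\n",
"model.save_model('xgb_gpu.model')\n",
"loaded_model = xgboost.Booster()\n",
"loaded_model.load_model('xgb_gpu.model')"
]
}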
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}