{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import cudf as gd\n",
"import pandas as pd\n",
"import numpy as np\n",
"import math\n",
"import xgboost as xgb\n",
"from functools import partial\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"import os\n",
"import time\n",
"os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"GPU_MEMORY = 32 # GB. please change it accordingly"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"TEST_ROWS = 453653104 # number of rows in test data\n",
"# no skip if your gpu has 32 GB memory\n",
"# otherwise, skip rows porportionally\n",
"SKIP_ROWS = int((1 - GPU_MEMORY/32.0)*TEST_ROWS) "
]
},
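{
"cell_type": "markdown",
"metadata": {},
"source": [
"The short sketch below is not part of the original run; it only illustrates how `SKIP_ROWS` scales with the `GPU_MEMORY` setting. The memory sizes are hypothetical examples."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Worked example with assumed memory sizes: a 32 GB GPU skips nothing,\n",
"# a 16 GB GPU skips roughly half of the test rows, and so on.\n",
"for mem_gb in [32, 16, 8]:\n",
"    skip = int((1 - mem_gb/32.0)*TEST_ROWS)\n",
"    print('%2d GB -> skip %d of %d rows' % (mem_gb, skip, TEST_ROWS))"
]
},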
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Functions"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def multi_weighted_logloss(y_true, y_preds, classes, class_weights):\n",
" \"\"\"\n",
" refactor from\n",
" @author olivier https://www.kaggle.com/ogrellier\n",
" multi logloss for PLAsTiCC challenge\n",
" \"\"\"\n",
" y_p = y_preds.reshape(y_true.shape[0], len(classes), order='F')\n",
" # Trasform y_true in dummies\n",
" y_ohe = pd.get_dummies(y_true)\n",
" # Normalize rows and limit y_preds to 1e-15, 1-1e-15\n",
" y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)\n",
" # Transform to log\n",
" y_p_log = np.log(y_p)\n",
" # Get the log for ones, .values is used to drop the index of DataFrames\n",
" # Exclude class 99 for now, since there is no class99 in the training set\n",
" # we gave a special process for that class\n",
" y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)\n",
" # Get the number of positives for each class\n",
" nb_pos = y_ohe.sum(axis=0).values.astype(float)\n",
" # Weight average and divide by the number of positives\n",
" class_arr = np.array([class_weights[k] for k in sorted(class_weights.keys())])\n",
" y_w = y_log_ones * class_arr / nb_pos\n",
"\n",
" loss = - np.sum(y_w) / np.sum(class_arr)\n",
" return loss\n",
"\n",
"def xgb_multi_weighted_logloss(y_predicted, y_true, classes, class_weights):\n",
" loss = multi_weighted_logloss(y_true.get_label(), y_predicted, \n",
" classes, class_weights)\n",
" return 'wloss', loss"
]
},
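{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the metric above on a hand-made toy example (not part of the original run). The labels, predictions and weights below are made up; they only demonstrate the expected input shapes: `y_preds` must be flattened column-major, matching the `reshape(..., order='F')` inside `multi_weighted_logloss`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy example with hypothetical values: three samples, two classes (6 and 15),\n",
"# with class 15 double-weighted as in the competition metric.\n",
"toy_true = np.array([6, 15, 6])\n",
"toy_classes = [6, 15]\n",
"toy_weights = {6: 1, 15: 2}\n",
"toy_probs = np.array([[0.8, 0.2],\n",
"                      [0.3, 0.7],\n",
"                      [0.6, 0.4]])\n",
"# flatten column-major so the reshape(order='F') inside the metric restores the matrix\n",
"toy_preds = toy_probs.flatten(order='F')\n",
"print(multi_weighted_logloss(toy_true, toy_preds, toy_classes, toy_weights))"
]
},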
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def ravel_column_names(cols):\n",
" d0 = cols.get_level_values(0)\n",
" d1 = cols.get_level_values(1)\n",
" return [\"%s_%s\"%(i,j) for i,j in zip(d0,d1)]\n",
" \n",
"def etl_cpu(df,df_meta):\n",
" df['flux_ratio_sq'] = np.power(df['flux'] / df['flux_err'], 2.0)\n",
" df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']\n",
" aggs = {\n",
" 'passband': ['mean'], \n",
" 'flux': ['min', 'max', 'mean'],\n",
" 'flux_err': ['min', 'max', 'mean'],\n",
" 'detected': ['mean'],\n",
" 'mjd':['max','min'],\n",
" 'flux_ratio_sq':['sum'],\n",
" 'flux_by_flux_ratio_sq':['sum'],\n",
" }\n",
" agg_df = df.groupby('object_id').agg(aggs)\n",
" agg_df.columns = ravel_column_names(agg_df.columns)\n",
" \n",
" agg_df['flux_diff'] = agg_df['flux_max'] - agg_df['flux_min']\n",
" agg_df['flux_dif2'] = (agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_mean']\n",
" agg_df['flux_w_mean'] = agg_df['flux_by_flux_ratio_sq_sum'] / agg_df['flux_ratio_sq_sum']\n",
" agg_df['flux_dif3'] = (agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_w_mean']\n",
" \n",
" agg_df['mjd_diff'] = agg_df['mjd_max'] - agg_df['mjd_min']\n",
" agg_df = agg_df.drop(['mjd_max','mjd_min'],axis=1)\n",
" \n",
" agg_df = agg_df.reset_index()\n",
" df_meta = df_meta.drop(['ra','decl','gal_l','gal_b'],axis=1)\n",
" df_meta = df_meta.merge(agg_df,on='object_id',how='left')\n",
" return df_meta"
]
},
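{
"cell_type": "markdown",
"metadata": {},
"source": [
"A tiny illustration (hypothetical data, not part of the original run) of what `ravel_column_names` does: `groupby().agg()` with a dict of lists produces a two-level column index such as `('flux', 'max')`, and the helper flattens it to `flux_max` so `etl_cpu` can refer to each column by a single name."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical mini dataframe, just to show the column flattening.\n",
"demo = pd.DataFrame({'object_id': [1, 1, 2], 'flux': [1.0, 2.0, 3.0]})\n",
"demo_agg = demo.groupby('object_id').agg({'flux': ['min', 'max']})\n",
"print(list(demo_agg.columns))   # [('flux', 'min'), ('flux', 'max')]\n",
"demo_agg.columns = ravel_column_names(demo_agg.columns)\n",
"print(list(demo_agg.columns))   # ['flux_min', 'flux_max']"
]
},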
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# To save GPU memory, we drop the column as soon as it is done with groupby\n",
"# \n",
"# this hits performance a little but avoids GPU OOM.\n",
"def groupby_aggs(df,aggs,col):\n",
" res = None\n",
" for i,j in aggs.items():\n",
" for k in j:\n",
" #print(i,k)\n",
" tmp = df.groupby(col).agg({i:[k]})\n",
" if res is None:\n",
" res = tmp\n",
" else:\n",
" res = res.merge(tmp,on=[col],how='left')\n",
" df.drop_column(i)\n",
" return res\n",
"\n",
"def etl_gpu(df,df_meta):\n",
" aggs = {\n",
" 'passband': ['mean'], \n",
" 'detected': ['mean'],\n",
" 'mjd':['max','min'],\n",
" }\n",
" agg_df = groupby_aggs(df,aggs,'object_id')\n",
" # at this step, columns ['passband','detected','mjd'] are deleted \n",
" \n",
" df['flux_ratio_sq'] = df['flux'] / df['flux_err']\n",
" df['flux_ratio_sq'] = df['flux_ratio_sq'].applymap(lambda x: math.pow(x,2))\n",
" df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']\n",
" \n",
" aggs2 = {\n",
" 'flux_ratio_sq':['sum'],\n",
" 'flux_by_flux_ratio_sq':['sum'],\n",
" 'flux': ['min', 'max', 'mean'],\n",
" 'flux_err': ['min', 'max', 'mean'],\n",
" }\n",
" agg_df2 = groupby_aggs(df,aggs2,'object_id')\n",
" agg_df = agg_df.merge(agg_df2,on=['object_id'],how='left')\n",
" del agg_df2\n",
"\n",
" agg_df['flux_diff'] = agg_df['max_flux'] - agg_df['min_flux']\n",
" agg_df['flux_dif2'] = (agg_df['max_flux'] - agg_df['min_flux']) / agg_df['mean_flux']\n",
" agg_df['flux_w_mean'] = agg_df['sum_flux_by_flux_ratio_sq'] / agg_df['sum_flux_ratio_sq']\n",
" agg_df['flux_dif3'] = (agg_df['max_flux'] - agg_df['min_flux']) / agg_df['flux_w_mean']\n",
" \n",
" agg_df['mjd_diff'] = agg_df['max_mjd'] - agg_df['min_mjd']\n",
" agg_df.drop_column('max_mjd')\n",
" agg_df.drop_column('min_mjd')\n",
" \n",
" for col in ['ra','decl','gal_l','gal_b']:\n",
" df_meta.drop_column(col)\n",
" \n",
" df_meta = df_meta.merge(agg_df,on=['object_id'],how='left')\n",
" return df_meta"
]
},
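{
"cell_type": "markdown",
"metadata": {},
"source": [
"The sketch below (hypothetical data, written with pandas purely for illustration) mirrors the memory-saving pattern of `groupby_aggs` above: aggregate one column at a time, merge the result into a running frame, and drop the source column as soon as it has been aggregated so peak memory stays low. The notebook applies the same idea to cuDF dataframes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: the column names and values are made up.\n",
"demo = pd.DataFrame({'object_id': [1, 1, 2, 2],\n",
"                     'passband': [0, 1, 2, 3],\n",
"                     'detected': [1, 0, 1, 1]})\n",
"res = None\n",
"for col, funcs in {'passband': ['mean'], 'detected': ['mean']}.items():\n",
"    tmp = demo.groupby('object_id').agg({col: funcs})\n",
"    tmp.columns = ravel_column_names(tmp.columns)\n",
"    res = tmp if res is None else res.join(tmp)\n",
"    demo = demo.drop(columns=[col])  # free the column as soon as it is aggregated\n",
"print(res.reset_index())"
]
},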
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data for ETL part 1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# please download data from https://www.kaggle.com/c/PLAsTiCC-2018/data\n",
"PATH = '../lsst/input'"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 13.4 s, sys: 8.29 s, total: 21.7 s\n",
"Wall time: 21.7 s\n"
]
}
],
"source": [
"%%time\n",
"# read data on gpu\n",
"ts_cols = ['object_id', 'mjd', 'passband', 'flux', 'flux_err', 'detected']\n",
"ts_dtypes = ['int32', 'float32', 'int32', 'float32','float32','int32']\n",
"\n",
"train_gd = gd.read_csv('%s/training_set.csv'%PATH,\n",
" names=ts_cols,dtype=ts_dtypes,skiprows=1)\n",
"test_gd = gd.read_csv('%s/test_set.csv'%PATH,\n",
" names=ts_cols,dtype=ts_dtypes,skiprows=1+SKIP_ROWS) # skip the header"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ETL with 120x ~ 140x speedup"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8 ms, sys: 8 ms, total: 16 ms\n",
"Wall time: 10 ms\n"
]
}
],
"source": [
"%%time\n",
"# to save memory, we need to move dataframe to cpu and only keep the columns we need\n",
"test_gd = test_gd[['object_id','flux']]\n",
"train_gd = train_gd[['object_id','flux']]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 9.65 s, sys: 7.06 s, total: 16.7 s\n",
"Wall time: 2.3 s\n"
]
}
],
"source": [
"%%time\n",
"test = test_gd.to_pandas()\n",
"train = train_gd.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from cudf_workaround import cudf_groupby_aggs\n",
"\"\"\"\n",
"def cudf_groupby_aggs(df,group_id_col,aggs):\n",
" \n",
" Parameters\n",
" ----------\n",
" df : cudf dataframe\n",
" dataframe to be grouped\n",
" group_id_col : string\n",
" name of the column which is used as the key of the group\n",
" aggs : dictionary\n",
" key is the name of column for which aggregation is calculated\n",
" values is the name of function for aggregation\n",
" Returns\n",
" -------\n",
" dg : cudf dataframe\n",
" result of groupby aggregation\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 10.3 s, sys: 1.04 s, total: 11.3 s\n",
"Wall time: 4.83 s\n"
]
}
],
"source": [
"%%time\n",
"# GPU\n",
"aggs = {'flux':['skew']}\n",
"test_gd = cudf_groupby_aggs(test_gd,group_id_col='object_id',aggs=aggs)\n",
"train_gd = cudf_groupby_aggs(train_gd,group_id_col='object_id',aggs=aggs)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 10min 22s, sys: 24.8 s, total: 10min 47s\n",
"Wall time: 10min 32s\n"
]
}
],
"source": [
"%%time\n",
"# CPU\n",
"test = test.groupby('object_id').agg(aggs)\n",
"train = train.groupby('object_id').agg(aggs)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we achieve 130.849 speedup!\n"
]
}
],
"source": [
"speedup = (10*60 + 32)/4.83\n",
"print(\"we achieve %.3f speedup!\"%speedup)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8 ms, sys: 32 ms, total: 40 ms\n",
"Wall time: 39.8 ms\n"
]
}
],
"source": [
"%%time\n",
"test_gd = test_gd.sort_values(by='object_id')\n",
"train_gd = train_gd.sort_values(by='object_id')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 7.02 s, sys: 24 ms, total: 7.04 s\n",
"Wall time: 176 ms\n"
]
}
],
"source": [
"%%time\n",
"test.columns = ['skew_flux']\n",
"test = test.reset_index()\n",
"test = test.sort_values(by='object_id')\n",
"train.columns = ['skew_flux']\n",
"train = train.reset_index()\n",
"train = train.sort_values(by='object_id')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test\n",
"object_id, rmse 0.000000\n",
"skew_flux, rmse 0.000000\n",
"train\n",
"object_id, rmse 0.000000\n",
"skew_flux, rmse 0.000001\n"
]
}
],
"source": [
"def rmse(a,b):\n",
" return np.mean((a-b)**2)**0.5\n",
"print('test')\n",
"for col in test.columns:\n",
" if col in test_gd.columns:\n",
" print(\"%s, rmse %.6f\"%(col,rmse(test[col].values,test_gd[col].to_pandas().values)))\n",
"print('train')\n",
"for col in train.columns:\n",
" if col in train_gd.columns:\n",
" print(\"%s, rmse %.6f\"%(col,rmse(train[col].values,train_gd[col].to_pandas().values)))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3492890 3492890\n"
]
}
],
"source": [
"test_flux_skew_gd = test_gd\n",
"test_flux_skew = test\n",
"train_flux_skew_gd = train_gd\n",
"train_flux_skew = train\n",
"print(len(test_gd),len(test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data with 11x speedup"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4min 2s, sys: 50.7 s, total: 4min 52s\n",
"Wall time: 3min 45s\n"
]
}
],
"source": [
"%%time\n",
"# read data on cpu\n",
"test = pd.read_csv('%s/test_set.csv'%PATH)\n",
"test_meta = pd.read_csv('%s/test_set_metadata.csv'%PATH)\n",
"\n",
"train = pd.read_csv('%s/training_set.csv'%PATH)\n",
"train_meta = pd.read_csv('%s/training_set_metadata.csv'%PATH)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 19.7 s, sys: 5.46 s, total: 25.2 s\n",
"Wall time: 18.7 s\n"
]
}
],
"source": [
"%%time\n",
"# read data on gpu\n",
"ts_cols = ['object_id', 'mjd', 'passband', 'flux', 'flux_err', 'detected']\n",
"ts_dtypes = ['int32', 'float32', 'int32', 'float32','float32','int32']\n",
"\n",
"test_gd = gd.read_csv('%s/test_set.csv'%PATH,\n",
" names=ts_cols,dtype=ts_dtypes,skiprows=1+SKIP_ROWS) # skip the header\n",
"train_gd = gd.read_csv('%s/training_set.csv'%PATH,\n",
" names=ts_cols,dtype=ts_dtypes,skiprows=1)\n",
"\n",
"cols = ['object_id', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf',\n",
" 'hostgal_specz', 'hostgal_photoz', 'hostgal_photoz_err', \n",
" 'distmod','mwebv', 'target']\n",
"dtypes = ['int32']+['float32']*4+['int32']+['float32']*5+['int32']\n",
"\n",
"train_meta_gd = gd.read_csv('%s/training_set_metadata.csv'%PATH,\n",
" names=cols,dtype=dtypes,skiprows=1)\n",
"del cols[-1],dtypes[-1]\n",
"test_meta_gd = gd.read_csv('%s/test_set_metadata.csv'%PATH,\n",
" names=cols,dtype=dtypes,skiprows=1)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we achieve 12.032 speedup!\n"
]
}
],
"source": [
"speedup = (3*60 + 45)/18.7\n",
"print(\"we achieve %.3f speedup!\"%speedup)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ETL with 9x ~ 12x speedup "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1:int32\n",
"2:int32\n",
"3:int32\n",
"1:int32\n",
"1:int32\n",
"2:int32\n",
"3:float32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:int32\n",
"2:int32\n",
"3:int32\n",
"3:int32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:int32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:int32\n",
"2:int32\n",
"3:int32\n",
"1:int32\n",
"1:int32\n",
"2:int32\n",
"3:float32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:int32\n",
"3:int32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"CPU times: user 4.29 s, sys: 3.08 s, total: 7.37 s\n",
"Wall time: 11.5 s\n"
]
}
],
"source": [
"%%time\n",
"# GPU\n",
"train_final_gd = etl_gpu(train_gd,train_meta_gd)\n",
"train_final_gd = train_final_gd.merge(train_flux_skew_gd,on=['object_id'],how='left')\n",
"test_final_gd = etl_gpu(test_gd,test_meta_gd)\n",
"del test_gd,test_meta_gd\n",
"test_final_gd = test_final_gd.merge(test_flux_skew_gd,on=['object_id'],how='left')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 56s, sys: 1min 42s, total: 5min 39s\n",
"Wall time: 1min 57s\n"
]
}
],
"source": [
"%%time\n",
"#CPU\n",
"train_final = etl_cpu(train,train_meta)\n",
"train_final = train_final.merge(train_flux_skew,on=['object_id'],how='left')\n",
"test_final = etl_cpu(test,test_meta)\n",
"test_final = test_final.merge(test_flux_skew,on=['object_id'],how='left')"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we achieve 10.174 speedup!\n"
]
}
],
"source": [
"speedup = (1*60 + 57)/11.5\n",
"print(\"we achieve %.3f speedup!\"%speedup)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# train and validation with 5x speedup"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 6 15 16 42 52 53 62 64 65 67 88 90 92 95]\n"
]
}
],
"source": [
"# CPU\n",
"X = train_final.drop(['object_id','target'],axis=1).values\n",
"y = train_final['target']\n",
"Xt = test_final.drop(['object_id'],axis=1).values\n",
"assert X.shape[1] == Xt.shape[1]\n",
"classes = sorted(y.unique()) \n",
"# Taken from Giba's topic : https://www.kaggle.com/titericz\n",
"# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194\n",
"# with Kyle Boone's post https://www.kaggle.com/kyleboone\n",
"class_weights = {c: 1 for c in classes}\n",
"class_weights.update({c:2 for c in [64, 15]})\n",
"\n",
"lbl = LabelEncoder()\n",
"y = lbl.fit_transform(y)\n",
"print(lbl.classes_)\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"cpu_params = {\n",
" 'objective': 'multi:softprob', \n",
" 'tree_method': 'hist', \n",
" 'nthread': 16, \n",
" 'num_class':14,\n",
" 'max_depth': 7, \n",
" 'silent':1,\n",
" 'subsample':0.7,\n",
" 'colsample_bytree': 0.7,}"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"func_loss = partial(xgb_multi_weighted_logloss, \n",
" classes=classes, \n",
" class_weights=class_weights)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[02:47:23] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.\n",
"[0]\teval-merror:0.326115\ttrain-merror:0.272122\teval-wloss:2.03443\ttrain-wloss:1.88621\n",
"Multiple eval metrics have been passed: 'train-wloss' will be used for early stopping.\n",
"\n",
"Will train until train-wloss hasn't improved in 10 rounds.\n",
"[59]\teval-merror:0.276433\ttrain-merror:0.001982\teval-wloss:1.33349\ttrain-wloss:0.091674\n",
"validation loss 1.3335\n",
"CPU times: user 6min 38s, sys: 1.43 s, total: 6min 40s\n",
"Wall time: 26.5 s\n"
]
}
],
"source": [
"%%time\n",
"dtrain = xgb.DMatrix(data=X_train, label=y_train)\n",
"dvalid = xgb.DMatrix(data=X_test, label=y_test)\n",
"dtest = xgb.DMatrix(data=Xt)\n",
"watchlist = [(dvalid, 'eval'), (dtrain, 'train')]\n",
"clf = xgb.train(cpu_params, dtrain=dtrain,\n",
" num_boost_round=60,evals=watchlist,\n",
" feval=func_loss,early_stopping_rounds=10,\n",
" verbose_eval=1000)\n",
"yp = clf.predict(dvalid)\n",
"loss = multi_weighted_logloss(y_test, yp, classes, class_weights)\n",
"ysub = clf.predict(dtest)\n",
"print('validation loss %.4f'%loss)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# GPU\n",
"y = train_final_gd['target'].to_array()\n",
"y = lbl.fit_transform(y)\n",
"for col in ['object_id','target']:\n",
" train_final_gd.drop_column(col)\n",
"for col in train_final_gd.columns:\n",
" train_final_gd[col] = train_final_gd[col].fillna(0).astype('float32')\n",
"\n",
"\n",
"for col in ['object_id']:\n",
" test_final_gd.drop_column(col)\n",
"for col in test_final_gd.columns:\n",
" test_final_gd[col] = test_final_gd[col].fillna(0).astype('float32')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"cols = [i for i in test_final_gd.columns]\n",
"X = train_final_gd[cols].as_matrix()\n",
"Xt = test_final_gd.as_matrix()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# GPU\n",
"gpu_params = cpu_params.copy()\n",
"gpu_params.update({'objective': 'multi:softprob',\n",
" 'tree_method': 'gpu_hist', \n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\teval-merror:0.338854\ttrain-merror:0.296899\teval-wloss:1.98145\ttrain-wloss:1.8993\n",
"Multiple eval metrics have been passed: 'train-wloss' will be used for early stopping.\n",
"\n",
"Will train until train-wloss hasn't improved in 10 rounds.\n",
"[59]\teval-merror:0.26242\ttrain-merror:0.002832\teval-wloss:1.13939\ttrain-wloss:0.102957\n",
"validation loss 1.1394\n",
"CPU times: user 1min 16s, sys: 1.12 s, total: 1min 17s\n",
"Wall time: 5.68 s\n"
]
}
],
"source": [
"%%time\n",
"dtrain = xgb.DMatrix(data=X_train, label=y_train)\n",
"dvalid = xgb.DMatrix(data=X_test, label=y_test)\n",
"dtest = xgb.DMatrix(data=Xt)\n",
"watchlist = [(dvalid, 'eval'), (dtrain, 'train')]\n",
"clf = xgb.train(gpu_params, dtrain=dtrain,\n",
" num_boost_round=60,evals=watchlist,\n",
" feval=func_loss,early_stopping_rounds=10,\n",
" verbose_eval=1000)\n",
"yp = clf.predict(dvalid)\n",
"loss = multi_weighted_logloss(y_test, yp, classes, class_weights)\n",
"ysub = clf.predict(dtest)\n",
"print('validation loss %.4f'%loss)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we achieve 4.665 speedup!\n"
]
}
],
"source": [
"speedup = 26.5/5.68\n",
"print(\"we achieve %.3f speedup!\"%speedup)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we achieve 24.594 speedup in total!\n"
]
}
],
"source": [
"total_speedup = (10*60 + 32 + 3*60 + 45 + 1*60 + 57 + 26.5)/(4.8+18.7+11.5+5.68)\n",
"print(\"we achieve %.3f speedup in total!\"%total_speedup)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"40.68"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"4.8+18.7+11.5+5.68"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}