{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import cudf as gd\n",
"import pandas as pd\n",
"import numpy as np\n",
"import math\n",
"import xgboost as xgb\n",
"from functools import partial\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"import os\n",
"import time\n",
"os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"GPU_MEMORY = 32 # GB. please change it accordingly"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"TEST_ROWS = 453653104 # number of rows in test data\n",
"# no skip if your gpu has 32 GB memory\n",
"# otherwise, skip rows porportionally\n",
"SKIP_ROWS = int((1 - GPU_MEMORY/32.0)*TEST_ROWS) "
]
},
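{
"cell_type": "markdown",
"metadata": {},
"source": [
"The short sketch below is not part of the original run; it only illustrates how `SKIP_ROWS` scales with the `GPU_MEMORY` setting. The memory sizes are hypothetical examples."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Worked example with assumed memory sizes: a 32 GB GPU skips nothing,\n",
"# a 16 GB GPU skips roughly half of the test rows, and so on.\n",
"for mem_gb in [32, 16, 8]:\n",
"    skip = int((1 - mem_gb/32.0)*TEST_ROWS)\n",
"    print('%2d GB -> skip %d of %d rows' % (mem_gb, skip, TEST_ROWS))"
]
},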
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Functions"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def multi_weighted_logloss(y_true, y_preds, classes, class_weights):\n",
" \"\"\"\n",
" refactor from\n",
" @author olivier https://www.kaggle.com/ogrellier\n",
" multi logloss for PLAsTiCC challenge\n",
" \"\"\"\n",
" y_p = y_preds.reshape(y_true.shape[0], len(classes), order='F')\n",
" # Trasform y_true in dummies\n",
" y_ohe = pd.get_dummies(y_true)\n",
" # Normalize rows and limit y_preds to 1e-15, 1-1e-15\n",
" y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)\n",
" # Transform to log\n",
" y_p_log = np.log(y_p)\n",
" # Get the log for ones, .values is used to drop the index of DataFrames\n",
" # Exclude class 99 for now, since there is no class99 in the training set\n",
" # we gave a special process for that class\n",
" y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)\n",
" # Get the number of positives for each class\n",
" nb_pos = y_ohe.sum(axis=0).values.astype(float)\n",
" # Weight average and divide by the number of positives\n",
" class_arr = np.array([class_weights[k] for k in sorted(class_weights.keys())])\n",
" y_w = y_log_ones * class_arr / nb_pos\n",
"\n",
" loss = - np.sum(y_w) / np.sum(class_arr)\n",
" return loss\n",
"\n",
"def xgb_multi_weighted_logloss(y_predicted, y_true, classes, class_weights):\n",
" loss = multi_weighted_logloss(y_true.get_label(), y_predicted, \n",
" classes, class_weights)\n",
" return 'wloss', loss"
]
},
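{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the metric above on a hand-made toy example (not part of the original run). The labels, predictions and weights below are made up; they only demonstrate the expected input shapes: `y_preds` must be flattened column-major, matching the `reshape(..., order='F')` inside `multi_weighted_logloss`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy example with hypothetical values: three samples, two classes (6 and 15),\n",
"# with class 15 double-weighted as in the competition metric.\n",
"toy_true = np.array([6, 15, 6])\n",
"toy_classes = [6, 15]\n",
"toy_weights = {6: 1, 15: 2}\n",
"toy_probs = np.array([[0.8, 0.2],\n",
"                      [0.3, 0.7],\n",
"                      [0.6, 0.4]])\n",
"# flatten column-major so the reshape(order='F') inside the metric restores the matrix\n",
"toy_preds = toy_probs.flatten(order='F')\n",
"print(multi_weighted_logloss(toy_true, toy_preds, toy_classes, toy_weights))"
]
},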
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def ravel_column_names(cols):\n",
" d0 = cols.get_level_values(0)\n",
" d1 = cols.get_level_values(1)\n",
" return [\"%s_%s\"%(i,j) for i,j in zip(d0,d1)]\n",
" \n",
"def etl_cpu(df,df_meta):\n",
" df['flux_ratio_sq'] = np.power(df['flux'] / df['flux_err'], 2.0)\n",
" df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']\n",
" aggs = {\n",
" 'passband': ['mean'], \n",
" 'flux': ['min', 'max', 'mean'],\n",
" 'flux_err': ['min', 'max', 'mean'],\n",
" 'detected': ['mean'],\n",
" 'mjd':['max','min'],\n",
" 'flux_ratio_sq':['sum'],\n",
" 'flux_by_flux_ratio_sq':['sum'],\n",
" }\n",
" agg_df = df.groupby('object_id').agg(aggs)\n",
" agg_df.columns = ravel_column_names(agg_df.columns)\n",
" \n",
" agg_df['flux_diff'] = agg_df['flux_max'] - agg_df['flux_min']\n",
" agg_df['flux_dif2'] = (agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_mean']\n",
" agg_df['flux_w_mean'] = agg_df['flux_by_flux_ratio_sq_sum'] / agg_df['flux_ratio_sq_sum']\n",
" agg_df['flux_dif3'] = (agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_w_mean']\n",
" \n",
" agg_df['mjd_diff'] = agg_df['mjd_max'] - agg_df['mjd_min']\n",
" agg_df = agg_df.drop(['mjd_max','mjd_min'],axis=1)\n",
" \n",
" agg_df = agg_df.reset_index()\n",
" df_meta = df_meta.drop(['ra','decl','gal_l','gal_b'],axis=1)\n",
" df_meta = df_meta.merge(agg_df,on='object_id',how='left')\n",
" return df_meta"
]
},
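{
"cell_type": "markdown",
"metadata": {},
"source": [
"A tiny illustration (hypothetical data, not part of the original run) of what `ravel_column_names` does: `groupby().agg()` with a dict of lists produces a two-level column index such as `('flux', 'max')`, and the helper flattens it to `flux_max` so `etl_cpu` can refer to each column by a single name."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical mini dataframe, just to show the column flattening.\n",
"demo = pd.DataFrame({'object_id': [1, 1, 2], 'flux': [1.0, 2.0, 3.0]})\n",
"demo_agg = demo.groupby('object_id').agg({'flux': ['min', 'max']})\n",
"print(list(demo_agg.columns))   # [('flux', 'min'), ('flux', 'max')]\n",
"demo_agg.columns = ravel_column_names(demo_agg.columns)\n",
"print(list(demo_agg.columns))   # ['flux_min', 'flux_max']"
]
},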
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# To save GPU memory, we drop the column as soon as it is done with groupby\n",
"# \n",
"# this hits performance a little but avoids GPU OOM.\n",
"def groupby_aggs(df,aggs,col):\n",
" res = None\n",
" for i,j in aggs.items():\n",
" for k in j:\n",
" #print(i,k)\n",
" tmp = df.groupby(col).agg({i:[k]})\n",
" if res is None:\n",
" res = tmp\n",
" else:\n",
" res = res.merge(tmp,on=[col],how='left')\n",
" df.drop_column(i)\n",
" return res\n",
"\n",
"def etl_gpu(df,df_meta):\n",
" aggs = {\n",
" 'passband': ['mean'], \n",
" 'detected': ['mean'],\n",
" 'mjd':['max','min'],\n",
" }\n",
" agg_df = groupby_aggs(df,aggs,'object_id')\n",
" # at this step, columns ['passband','detected','mjd'] are deleted \n",
" \n",
" df['flux_ratio_sq'] = df['flux'] / df['flux_err']\n",
" df['flux_ratio_sq'] = df['flux_ratio_sq'].applymap(lambda x: math.pow(x,2))\n",
" df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']\n",
" \n",
" aggs2 = {\n",
" 'flux_ratio_sq':['sum'],\n",
" 'flux_by_flux_ratio_sq':['sum'],\n",
" 'flux': ['min', 'max', 'mean'],\n",
" 'flux_err': ['min', 'max', 'mean'],\n",
" }\n",
" agg_df2 = groupby_aggs(df,aggs2,'object_id')\n",
" agg_df = agg_df.merge(agg_df2,on=['object_id'],how='left')\n",
" del agg_df2\n",
"\n",
" agg_df['flux_diff'] = agg_df['max_flux'] - agg_df['min_flux']\n",
" agg_df['flux_dif2'] = (agg_df['max_flux'] - agg_df['min_flux']) / agg_df['mean_flux']\n",
" agg_df['flux_w_mean'] = agg_df['sum_flux_by_flux_ratio_sq'] / agg_df['sum_flux_ratio_sq']\n",
" agg_df['flux_dif3'] = (agg_df['max_flux'] - agg_df['min_flux']) / agg_df['flux_w_mean']\n",
" \n",
" agg_df['mjd_diff'] = agg_df['max_mjd'] - agg_df['min_mjd']\n",
" agg_df.drop_column('max_mjd')\n",
" agg_df.drop_column('min_mjd')\n",
" \n",
" for col in ['ra','decl','gal_l','gal_b']:\n",
" df_meta.drop_column(col)\n",
" \n",
" df_meta = df_meta.merge(agg_df,on=['object_id'],how='left')\n",
" return df_meta"
]
},
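{
"cell_type": "markdown",
"metadata": {},
"source": [
"The sketch below (hypothetical data, written with pandas purely for illustration) mirrors the memory-saving pattern of `groupby_aggs` above: aggregate one column at a time, merge the result into a running frame, and drop the source column as soon as it has been aggregated so peak memory stays low. The notebook applies the same idea to cuDF dataframes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: the column names and values are made up.\n",
"demo = pd.DataFrame({'object_id': [1, 1, 2, 2],\n",
"                     'passband': [0, 1, 2, 3],\n",
"                     'detected': [1, 0, 1, 1]})\n",
"res = None\n",
"for col, funcs in {'passband': ['mean'], 'detected': ['mean']}.items():\n",
"    tmp = demo.groupby('object_id').agg({col: funcs})\n",
"    tmp.columns = ravel_column_names(tmp.columns)\n",
"    res = tmp if res is None else res.join(tmp)\n",
"    demo = demo.drop(columns=[col])  # free the column as soon as it is aggregated\n",
"print(res.reset_index())"
]
},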
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data for ETL part 1"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# please download data from https://www.kaggle.com/c/PLAsTiCC-2018/data\n",
"PATH = '../lsst/input'"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 13.4 s, sys: 8.29 s, total: 21.7 s\n",
"Wall time: 21.7 s\n"
]
}
],
"source": [
"%%time\n",
"# read data on gpu\n",
"ts_cols = ['object_id', 'mjd', 'passband', 'flux', 'flux_err', 'detected']\n",
"ts_dtypes = ['int32', 'float32', 'int32', 'float32','float32','int32']\n",
"\n",
"train_gd = gd.read_csv('%s/training_set.csv'%PATH,\n",
" names=ts_cols,dtype=ts_dtypes,skiprows=1)\n",
"test_gd = gd.read_csv('%s/test_set.csv'%PATH,\n",
" names=ts_cols,dtype=ts_dtypes,skiprows=1+SKIP_ROWS) # skip the header"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ETL with 120x ~ 140x speedup"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8 ms, sys: 8 ms, total: 16 ms\n",
"Wall time: 10 ms\n"
]
}
],
"source": [
"%%time\n",
"# to save memory, we need to move dataframe to cpu and only keep the columns we need\n",
"test_gd = test_gd[['object_id','flux']]\n",
"train_gd = train_gd[['object_id','flux']]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 9.65 s, sys: 7.06 s, total: 16.7 s\n",
"Wall time: 2.3 s\n"
]
}
],
"source": [
"%%time\n",
"test = test_gd.to_pandas()\n",
"train = train_gd.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from cudf_workaround import cudf_groupby_aggs\n",
"\"\"\"\n",
"def cudf_groupby_aggs(df,group_id_col,aggs):\n",
" \n",
" Parameters\n",
" ----------\n",
" df : cudf dataframe\n",
" dataframe to be grouped\n",
" group_id_col : string\n",
" name of the column which is used as the key of the group\n",
" aggs : dictionary\n",
" key is the name of column for which aggregation is calculated\n",
" values is the name of function for aggregation\n",
" Returns\n",
" -------\n",
" dg : cudf dataframe\n",
" result of groupby aggregation\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 10.3 s, sys: 1.04 s, total: 11.3 s\n",
"Wall time: 4.83 s\n"
]
}
],
"source": [
"%%time\n",
"# GPU\n",
"aggs = {'flux':['skew']}\n",
"test_gd = cudf_groupby_aggs(test_gd,group_id_col='object_id',aggs=aggs)\n",
"train_gd = cudf_groupby_aggs(train_gd,group_id_col='object_id',aggs=aggs)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 10min 22s, sys: 24.8 s, total: 10min 47s\n",
"Wall time: 10min 32s\n"
]
}
],
"source": [
"%%time\n",
"# CPU\n",
"test = test.groupby('object_id').agg(aggs)\n",
"train = train.groupby('object_id').agg(aggs)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we achieve 130.849 speedup!\n"
]
}
],
"source": [
"speedup = (10*60 + 32)/4.83\n",
"print(\"we achieve %.3f speedup!\"%speedup)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8 ms, sys: 32 ms, total: 40 ms\n",
"Wall time: 39.8 ms\n"
]
}
],
"source": [
"%%time\n",
"test_gd = test_gd.sort_values(by='object_id')\n",
"train_gd = train_gd.sort_values(by='object_id')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 7.02 s, sys: 24 ms, total: 7.04 s\n",
"Wall time: 176 ms\n"
]
}
],
"source": [
"%%time\n",
"test.columns = ['skew_flux']\n",
"test = test.reset_index()\n",
"test = test.sort_values(by='object_id')\n",
"train.columns = ['skew_flux']\n",
"train = train.reset_index()\n",
"train = train.sort_values(by='object_id')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test\n",
"object_id, rmse 0.000000\n",
"skew_flux, rmse 0.000000\n",
"train\n",
"object_id, rmse 0.000000\n",
"skew_flux, rmse 0.000001\n"
]
}
],
"source": [
"def rmse(a,b):\n",
" return np.mean((a-b)**2)**0.5\n",
"print('test')\n",
"for col in test.columns:\n",
" if col in test_gd.columns:\n",
" print(\"%s, rmse %.6f\"%(col,rmse(test[col].values,test_gd[col].to_pandas().values)))\n",
"print('train')\n",
"for col in train.columns:\n",
" if col in train_gd.columns:\n",
" print(\"%s, rmse %.6f\"%(col,rmse(train[col].values,train_gd[col].to_pandas().values)))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3492890 3492890\n"
]
}
],
"source": [
"test_flux_skew_gd = test_gd\n",
"test_flux_skew = test\n",
"train_flux_skew_gd = train_gd\n",
"train_flux_skew = train\n",
"print(len(test_gd),len(test))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load data with 11x speedup"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4min 2s, sys: 50.7 s, total: 4min 52s\n",
"Wall time: 3min 45s\n"
]
}
],
"source": [
"%%time\n",
"# read data on cpu\n",
"test = pd.read_csv('%s/test_set.csv'%PATH)\n",
"test_meta = pd.read_csv('%s/test_set_metadata.csv'%PATH)\n",
"\n",
"train = pd.read_csv('%s/training_set.csv'%PATH)\n",
"train_meta = pd.read_csv('%s/training_set_metadata.csv'%PATH)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 19.7 s, sys: 5.46 s, total: 25.2 s\n",
"Wall time: 18.7 s\n"
]
}
],
"source": [
"%%time\n",
"# read data on gpu\n",
"ts_cols = ['object_id', 'mjd', 'passband', 'flux', 'flux_err', 'detected']\n",
"ts_dtypes = ['int32', 'float32', 'int32', 'float32','float32','int32']\n",
"\n",
"test_gd = gd.read_csv('%s/test_set.csv'%PATH,\n",
" names=ts_cols,dtype=ts_dtypes,skiprows=1+SKIP_ROWS) # skip the header\n",
"train_gd = gd.read_csv('%s/training_set.csv'%PATH,\n",
" names=ts_cols,dtype=ts_dtypes,skiprows=1)\n",
"\n",
"cols = ['object_id', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf',\n",
" 'hostgal_specz', 'hostgal_photoz', 'hostgal_photoz_err', \n",
" 'distmod','mwebv', 'target']\n",
"dtypes = ['int32']+['float32']*4+['int32']+['float32']*5+['int32']\n",
"\n",
"train_meta_gd = gd.read_csv('%s/training_set_metadata.csv'%PATH,\n",
" names=cols,dtype=dtypes,skiprows=1)\n",
"del cols[-1],dtypes[-1]\n",
"test_meta_gd = gd.read_csv('%s/test_set_metadata.csv'%PATH,\n",
" names=cols,dtype=dtypes,skiprows=1)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we achieve 12.032 speedup!\n"
]
}
],
"source": [
"speedup = (3*60 + 45)/18.7\n",
"print(\"we achieve %.3f speedup!\"%speedup)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ETL with 9x ~ 12x speedup "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1:int32\n",
"2:int32\n",
"3:int32\n",
"1:int32\n",
"1:int32\n",
"2:int32\n",
"3:float32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:int32\n",
"2:int32\n",
"3:int32\n",
"3:int32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:int32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:int32\n",
"2:int32\n",
"3:int32\n",
"1:int32\n",
"1:int32\n",
"2:int32\n",
"3:float32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:int32\n",
"3:int32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"3:float32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:int32\n",
"1:int32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"1:float32\n",
"2:int32\n",
"3:float32\n",
"CPU times: user 4.29 s, sys: 3.08 s, total: 7.37 s\n",
"Wall time: 11.5 s\n"
]
}
],
"source": [
"%%time\n",
"# GPU\n",
"train_final_gd = etl_gpu(train_gd,train_meta_gd)\n",
"train_final_gd = train_final_gd.merge(train_flux_skew_gd,on=['object_id'],how='left')\n",
"test_final_gd = etl_gpu(test_gd,test_meta_gd)\n",
"del test_gd,test_meta_gd\n",
"test_final_gd = test_final_gd.merge(test_flux_skew_gd,on=['object_id'],how='left')"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 56s, sys: 1min 42s, total: 5min 39s\n",
"Wall time: 1min 57s\n"
]
}
],
"source": [
"%%time\n",
"#CPU\n",
"train_final = etl_cpu(train,train_meta)\n",
"train_final = train_final.merge(train_flux_skew,on=['object_id'],how='left')\n",
"test_final = etl_cpu(test,test_meta)\n",
"test_final = test_final.merge(test_flux_skew,on=['object_id'],how='left')"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we achieve 10.174 speedup!\n"
]
}
],
"source": [
"speedup = (1*60 + 57)/11.5\n",
"print(\"we achieve %.3f speedup!\"%speedup)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# train and validation with 5x speedup"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 6 15 16 42 52 53 62 64 65 67 88 90 92 95]\n"
]
}
],
"source": [
"# CPU\n",
"X = train_final.drop(['object_id','target'],axis=1).values\n",
"y = train_final['target']\n",
"Xt = test_final.drop(['object_id'],axis=1).values\n",
"assert X.shape[1] == Xt.shape[1]\n",
"classes = sorted(y.unique()) \n",
"# Taken from Giba's topic : https://www.kaggle.com/titericz\n",
"# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194\n",
"# with Kyle Boone's post https://www.kaggle.com/kyleboone\n",
"class_weights = {c: 1 for c in classes}\n",
"class_weights.update({c:2 for c in [64, 15]})\n",
"\n",
"lbl = LabelEncoder()\n",
"y = lbl.fit_transform(y)\n",
"print(lbl.classes_)\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"cpu_params = {\n",
" 'objective': 'multi:softprob', \n",
" 'tree_method': 'hist', \n",
" 'nthread': 16, \n",
" 'num_class':14,\n",
" 'max_depth': 7, \n",
" 'silent':1,\n",
" 'subsample':0.7,\n",
" 'colsample_bytree': 0.7,}"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"func_loss = partial(xgb_multi_weighted_logloss, \n",
" classes=classes, \n",
" class_weights=class_weights)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[02:47:23] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.\n",
"[0]\teval-merror:0.326115\ttrain-merror:0.272122\teval-wloss:2.03443\ttrain-wloss:1.88621\n",
"Multiple eval metrics have been passed: 'train-wloss' will be used for early stopping.\n",
"\n",
"Will train until train-wloss hasn't improved in 10 rounds.\n",
"[59]\teval-merror:0.276433\ttrain-merror:0.001982\teval-wloss:1.33349\ttrain-wloss:0.091674\n",
"validation loss 1.3335\n",
"CPU times: user 6min 38s, sys: 1.43 s, total: 6min 40s\n",
"Wall time: 26.5 s\n"
]
}
],
"source": [
"%%time\n",
"dtrain = xgb.DMatrix(data=X_train, label=y_train)\n",
"dvalid = xgb.DMatrix(data=X_test, label=y_test)\n",
"dtest = xgb.DMatrix(data=Xt)\n",
"watchlist = [(dvalid, 'eval'), (dtrain, 'train')]\n",
"clf = xgb.train(cpu_params, dtrain=dtrain,\n",
" num_boost_round=60,evals=watchlist,\n",
" feval=func_loss,early_stopping_rounds=10,\n",
" verbose_eval=1000)\n",
"yp = clf.predict(dvalid)\n",
"loss = multi_weighted_logloss(y_test, yp, classes, class_weights)\n",
"ysub = clf.predict(dtest)\n",
"print('validation loss %.4f'%loss)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# GPU\n",
"y = train_final_gd['target'].to_array()\n",
"y = lbl.fit_transform(y)\n",
"for col in ['object_id','target']:\n",
" train_final_gd.drop_column(col)\n",
"for col in train_final_gd.columns:\n",
" train_final_gd[col] = train_final_gd[col].fillna(0).astype('float32')\n",
"\n",
"\n",
"for col in ['object_id']:\n",
" test_final_gd.drop_column(col)\n",
"for col in test_final_gd.columns:\n",
" test_final_gd[col] = test_final_gd[col].fillna(0).astype('float32')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"cols = [i for i in test_final_gd.columns]\n",
"X = train_final_gd[cols].as_matrix()\n",
"Xt = test_final_gd.as_matrix()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# GPU\n",
"gpu_params = cpu_params.copy()\n",
"gpu_params.update({'objective': 'multi:softprob',\n",
" 'tree_method': 'gpu_hist', \n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0]\teval-merror:0.338854\ttrain-merror:0.296899\teval-wloss:1.98145\ttrain-wloss:1.8993\n",
"Multiple eval metrics have been passed: 'train-wloss' will be used for early stopping.\n",
"\n",
"Will train until train-wloss hasn't improved in 10 rounds.\n",
"[59]\teval-merror:0.26242\ttrain-merror:0.002832\teval-wloss:1.13939\ttrain-wloss:0.102957\n",
"validation loss 1.1394\n",
"CPU times: user 1min 16s, sys: 1.12 s, total: 1min 17s\n",
"Wall time: 5.68 s\n"
]
}
],
"source": [
"%%time\n",
"dtrain = xgb.DMatrix(data=X_train, label=y_train)\n",
"dvalid = xgb.DMatrix(data=X_test, label=y_test)\n",
"dtest = xgb.DMatrix(data=Xt)\n",
"watchlist = [(dvalid, 'eval'), (dtrain, 'train')]\n",
"clf = xgb.train(gpu_params, dtrain=dtrain,\n",
" num_boost_round=60,evals=watchlist,\n",
" feval=func_loss,early_stopping_rounds=10,\n",
" verbose_eval=1000)\n",
"yp = clf.predict(dvalid)\n",
"loss = multi_weighted_logloss(y_test, yp, classes, class_weights)\n",
"ysub = clf.predict(dtest)\n",
"print('validation loss %.4f'%loss)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we achieve 4.665 speedup!\n"
]
}
],
"source": [
"speedup = 26.5/5.68\n",
"print(\"we achieve %.3f speedup!\"%speedup)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we achieve 24.594 speedup in total!\n"
]
}
],
"source": [
"total_speedup = (10*60 + 32 + 3*60 + 45 + 1*60 + 57 + 26.5)/(4.8+18.7+11.5+5.68)\n",
"print(\"we achieve %.3f speedup in total!\"%total_speedup)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"40.68"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"4.8+18.7+11.5+5.68"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}