Last active
January 16, 2019 20:11
-
-
Save daxiongshu/57d079b8233bcfa25787ca70649cd11a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import cudf as gd\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import math\n", | |
"import xgboost as xgb\n", | |
"from functools import partial\n", | |
"from sklearn.preprocessing import LabelEncoder\n", | |
"from sklearn.model_selection import train_test_split\n", | |
"import os\n", | |
"import time\n", | |
"os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n", | |
"import warnings\n", | |
"warnings.filterwarnings(\"ignore\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"GPU_MEMORY = 32 # GB. please change it accordingly" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"TEST_ROWS = 453653104 # number of rows in test data\n", | |
"# no skip if your gpu has 32 GB memory\n", | |
"# otherwise, skip rows porportionally\n", | |
"SKIP_ROWS = int((1 - GPU_MEMORY/32.0)*TEST_ROWS) " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Functions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def multi_weighted_logloss(y_true, y_preds, classes, class_weights):\n", | |
" \"\"\"\n", | |
" refactor from\n", | |
" @author olivier https://www.kaggle.com/ogrellier\n", | |
" multi logloss for PLAsTiCC challenge\n", | |
" \"\"\"\n", | |
" y_p = y_preds.reshape(y_true.shape[0], len(classes), order='F')\n", | |
" # Trasform y_true in dummies\n", | |
" y_ohe = pd.get_dummies(y_true)\n", | |
" # Normalize rows and limit y_preds to 1e-15, 1-1e-15\n", | |
" y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)\n", | |
" # Transform to log\n", | |
" y_p_log = np.log(y_p)\n", | |
" # Get the log for ones, .values is used to drop the index of DataFrames\n", | |
" # Exclude class 99 for now, since there is no class99 in the training set\n", | |
" # we gave a special process for that class\n", | |
" y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)\n", | |
" # Get the number of positives for each class\n", | |
" nb_pos = y_ohe.sum(axis=0).values.astype(float)\n", | |
" # Weight average and divide by the number of positives\n", | |
" class_arr = np.array([class_weights[k] for k in sorted(class_weights.keys())])\n", | |
" y_w = y_log_ones * class_arr / nb_pos\n", | |
"\n", | |
" loss = - np.sum(y_w) / np.sum(class_arr)\n", | |
" return loss\n", | |
"\n", | |
"def xgb_multi_weighted_logloss(y_predicted, y_true, classes, class_weights):\n", | |
" loss = multi_weighted_logloss(y_true.get_label(), y_predicted, \n", | |
" classes, class_weights)\n", | |
" return 'wloss', loss" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def ravel_column_names(cols):\n", | |
" d0 = cols.get_level_values(0)\n", | |
" d1 = cols.get_level_values(1)\n", | |
" return [\"%s_%s\"%(i,j) for i,j in zip(d0,d1)]\n", | |
" \n", | |
"def etl_cpu(df,df_meta):\n", | |
" df['flux_ratio_sq'] = np.power(df['flux'] / df['flux_err'], 2.0)\n", | |
" df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']\n", | |
" aggs = {\n", | |
" 'passband': ['mean'], \n", | |
" 'flux': ['min', 'max', 'mean'],\n", | |
" 'flux_err': ['min', 'max', 'mean'],\n", | |
" 'detected': ['mean'],\n", | |
" 'mjd':['max','min'],\n", | |
" 'flux_ratio_sq':['sum'],\n", | |
" 'flux_by_flux_ratio_sq':['sum'],\n", | |
" }\n", | |
" agg_df = df.groupby('object_id').agg(aggs)\n", | |
" agg_df.columns = ravel_column_names(agg_df.columns)\n", | |
" \n", | |
" agg_df['flux_diff'] = agg_df['flux_max'] - agg_df['flux_min']\n", | |
" agg_df['flux_dif2'] = (agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_mean']\n", | |
" agg_df['flux_w_mean'] = agg_df['flux_by_flux_ratio_sq_sum'] / agg_df['flux_ratio_sq_sum']\n", | |
" agg_df['flux_dif3'] = (agg_df['flux_max'] - agg_df['flux_min']) / agg_df['flux_w_mean']\n", | |
" \n", | |
" agg_df['mjd_diff'] = agg_df['mjd_max'] - agg_df['mjd_min']\n", | |
" agg_df = agg_df.drop(['mjd_max','mjd_min'],axis=1)\n", | |
" \n", | |
" agg_df = agg_df.reset_index()\n", | |
" df_meta = df_meta.drop(['ra','decl','gal_l','gal_b'],axis=1)\n", | |
" df_meta = df_meta.merge(agg_df,on='object_id',how='left')\n", | |
" return df_meta" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# To save GPU memory, we drop the column as soon as it is done with groupby\n", | |
"# \n", | |
"# this hits performance a little but avoids GPU OOM.\n", | |
"def groupby_aggs(df,aggs,col):\n", | |
" res = None\n", | |
" for i,j in aggs.items():\n", | |
" for k in j:\n", | |
" #print(i,k)\n", | |
" tmp = df.groupby(col).agg({i:[k]})\n", | |
" if res is None:\n", | |
" res = tmp\n", | |
" else:\n", | |
" res = res.merge(tmp,on=[col],how='left')\n", | |
" df.drop_column(i)\n", | |
" return res\n", | |
"\n", | |
"def etl_gpu(df,df_meta):\n", | |
" aggs = {\n", | |
" 'passband': ['mean'], \n", | |
" 'detected': ['mean'],\n", | |
" 'mjd':['max','min'],\n", | |
" }\n", | |
" agg_df = groupby_aggs(df,aggs,'object_id')\n", | |
" # at this step, columns ['passband','detected','mjd'] are deleted \n", | |
" \n", | |
" df['flux_ratio_sq'] = df['flux'] / df['flux_err']\n", | |
" df['flux_ratio_sq'] = df['flux_ratio_sq'].applymap(lambda x: math.pow(x,2))\n", | |
" df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']\n", | |
" \n", | |
" aggs2 = {\n", | |
" 'flux_ratio_sq':['sum'],\n", | |
" 'flux_by_flux_ratio_sq':['sum'],\n", | |
" 'flux': ['min', 'max', 'mean'],\n", | |
" 'flux_err': ['min', 'max', 'mean'],\n", | |
" }\n", | |
" agg_df2 = groupby_aggs(df,aggs2,'object_id')\n", | |
" agg_df = agg_df.merge(agg_df2,on=['object_id'],how='left')\n", | |
" del agg_df2\n", | |
"\n", | |
" agg_df['flux_diff'] = agg_df['max_flux'] - agg_df['min_flux']\n", | |
" agg_df['flux_dif2'] = (agg_df['max_flux'] - agg_df['min_flux']) / agg_df['mean_flux']\n", | |
" agg_df['flux_w_mean'] = agg_df['sum_flux_by_flux_ratio_sq'] / agg_df['sum_flux_ratio_sq']\n", | |
" agg_df['flux_dif3'] = (agg_df['max_flux'] - agg_df['min_flux']) / agg_df['flux_w_mean']\n", | |
" \n", | |
" agg_df['mjd_diff'] = agg_df['max_mjd'] - agg_df['min_mjd']\n", | |
" agg_df.drop_column('max_mjd')\n", | |
" agg_df.drop_column('min_mjd')\n", | |
" \n", | |
" for col in ['ra','decl','gal_l','gal_b']:\n", | |
" df_meta.drop_column(col)\n", | |
" \n", | |
" df_meta = df_meta.merge(agg_df,on=['object_id'],how='left')\n", | |
" return df_meta" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Load data for ETL part 1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# please download data from https://www.kaggle.com/c/PLAsTiCC-2018/data\n", | |
"PATH = '../lsst/input'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 13.4 s, sys: 8.29 s, total: 21.7 s\n", | |
"Wall time: 21.7 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# read data on gpu\n", | |
"ts_cols = ['object_id', 'mjd', 'passband', 'flux', 'flux_err', 'detected']\n", | |
"ts_dtypes = ['int32', 'float32', 'int32', 'float32','float32','int32']\n", | |
"\n", | |
"train_gd = gd.read_csv('%s/training_set.csv'%PATH,\n", | |
" names=ts_cols,dtype=ts_dtypes,skiprows=1)\n", | |
"test_gd = gd.read_csv('%s/test_set.csv'%PATH,\n", | |
" names=ts_cols,dtype=ts_dtypes,skiprows=1+SKIP_ROWS) # skip the header" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# ETL with 120x ~ 140x speedup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 8 ms, sys: 8 ms, total: 16 ms\n", | |
"Wall time: 10 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# to save memory, we need to move dataframe to cpu and only keep the columns we need\n", | |
"test_gd = test_gd[['object_id','flux']]\n", | |
"train_gd = train_gd[['object_id','flux']]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 9.65 s, sys: 7.06 s, total: 16.7 s\n", | |
"Wall time: 2.3 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"test = test_gd.to_pandas()\n", | |
"train = train_gd.to_pandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from cudf_workaround import cudf_groupby_aggs\n", | |
"\"\"\"\n", | |
"def cudf_groupby_aggs(df,group_id_col,aggs):\n", | |
" \n", | |
" Parameters\n", | |
" ----------\n", | |
" df : cudf dataframe\n", | |
" dataframe to be grouped\n", | |
" group_id_col : string\n", | |
" name of the column which is used as the key of the group\n", | |
" aggs : dictionary\n", | |
" key is the name of column for which aggregation is calculated\n", | |
" values is the name of function for aggregation\n", | |
" Returns\n", | |
" -------\n", | |
" dg : cudf dataframe\n", | |
" result of groupby aggregation\n", | |
"\"\"\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 10.3 s, sys: 1.04 s, total: 11.3 s\n", | |
"Wall time: 4.83 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# GPU\n", | |
"aggs = {'flux':['skew']}\n", | |
"test_gd = cudf_groupby_aggs(test_gd,group_id_col='object_id',aggs=aggs)\n", | |
"train_gd = cudf_groupby_aggs(train_gd,group_id_col='object_id',aggs=aggs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 10min 22s, sys: 24.8 s, total: 10min 47s\n", | |
"Wall time: 10min 32s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# CPU\n", | |
"test = test.groupby('object_id').agg(aggs)\n", | |
"train = train.groupby('object_id').agg(aggs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"we achieve 130.849 speedup!\n" | |
] | |
} | |
], | |
"source": [ | |
"speedup = (10*60 + 32)/4.83\n", | |
"print(\"we achieve %.3f speedup!\"%speedup)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 8 ms, sys: 32 ms, total: 40 ms\n", | |
"Wall time: 39.8 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"test_gd = test_gd.sort_values(by='object_id')\n", | |
"train_gd = train_gd.sort_values(by='object_id')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 7.02 s, sys: 24 ms, total: 7.04 s\n", | |
"Wall time: 176 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"test.columns = ['skew_flux']\n", | |
"test = test.reset_index()\n", | |
"test = test.sort_values(by='object_id')\n", | |
"train.columns = ['skew_flux']\n", | |
"train = train.reset_index()\n", | |
"train = train.sort_values(by='object_id')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"test\n", | |
"object_id, rmse 0.000000\n", | |
"skew_flux, rmse 0.000000\n", | |
"train\n", | |
"object_id, rmse 0.000000\n", | |
"skew_flux, rmse 0.000001\n" | |
] | |
} | |
], | |
"source": [ | |
"def rmse(a,b):\n", | |
" return np.mean((a-b)**2)**0.5\n", | |
"print('test')\n", | |
"for col in test.columns:\n", | |
" if col in test_gd.columns:\n", | |
" print(\"%s, rmse %.6f\"%(col,rmse(test[col].values,test_gd[col].to_pandas().values)))\n", | |
"print('train')\n", | |
"for col in train.columns:\n", | |
" if col in train_gd.columns:\n", | |
" print(\"%s, rmse %.6f\"%(col,rmse(train[col].values,train_gd[col].to_pandas().values)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3492890 3492890\n" | |
] | |
} | |
], | |
"source": [ | |
"test_flux_skew_gd = test_gd\n", | |
"test_flux_skew = test\n", | |
"train_flux_skew_gd = train_gd\n", | |
"train_flux_skew = train\n", | |
"print(len(test_gd),len(test))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Load data with 11x speedup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 4min 2s, sys: 50.7 s, total: 4min 52s\n", | |
"Wall time: 3min 45s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# read data on cpu\n", | |
"test = pd.read_csv('%s/test_set.csv'%PATH)\n", | |
"test_meta = pd.read_csv('%s/test_set_metadata.csv'%PATH)\n", | |
"\n", | |
"train = pd.read_csv('%s/training_set.csv'%PATH)\n", | |
"train_meta = pd.read_csv('%s/training_set_metadata.csv'%PATH)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 19.7 s, sys: 5.46 s, total: 25.2 s\n", | |
"Wall time: 18.7 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# read data on gpu\n", | |
"ts_cols = ['object_id', 'mjd', 'passband', 'flux', 'flux_err', 'detected']\n", | |
"ts_dtypes = ['int32', 'float32', 'int32', 'float32','float32','int32']\n", | |
"\n", | |
"test_gd = gd.read_csv('%s/test_set.csv'%PATH,\n", | |
" names=ts_cols,dtype=ts_dtypes,skiprows=1+SKIP_ROWS) # skip the header\n", | |
"train_gd = gd.read_csv('%s/training_set.csv'%PATH,\n", | |
" names=ts_cols,dtype=ts_dtypes,skiprows=1)\n", | |
"\n", | |
"cols = ['object_id', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf',\n", | |
" 'hostgal_specz', 'hostgal_photoz', 'hostgal_photoz_err', \n", | |
" 'distmod','mwebv', 'target']\n", | |
"dtypes = ['int32']+['float32']*4+['int32']+['float32']*5+['int32']\n", | |
"\n", | |
"train_meta_gd = gd.read_csv('%s/training_set_metadata.csv'%PATH,\n", | |
" names=cols,dtype=dtypes,skiprows=1)\n", | |
"del cols[-1],dtypes[-1]\n", | |
"test_meta_gd = gd.read_csv('%s/test_set_metadata.csv'%PATH,\n", | |
" names=cols,dtype=dtypes,skiprows=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"we achieve 12.032 speedup!\n" | |
] | |
} | |
], | |
"source": [ | |
"speedup = (3*60 + 45)/18.7\n", | |
"print(\"we achieve %.3f speedup!\"%speedup)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# ETL with 9x ~ 12x speedup " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1:int32\n", | |
"2:int32\n", | |
"3:int32\n", | |
"1:int32\n", | |
"1:int32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:int32\n", | |
"1:int32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:int32\n", | |
"1:int32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"1:int32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:int32\n", | |
"2:int32\n", | |
"3:int32\n", | |
"3:int32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"1:int32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:int32\n", | |
"1:int32\n", | |
"1:int32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:int32\n", | |
"2:int32\n", | |
"3:int32\n", | |
"1:int32\n", | |
"1:int32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:int32\n", | |
"1:int32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"1:int32\n", | |
"1:int32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"1:int32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:int32\n", | |
"3:int32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"3:float32\n", | |
"1:int32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:int32\n", | |
"1:int32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"1:float32\n", | |
"2:int32\n", | |
"3:float32\n", | |
"CPU times: user 4.29 s, sys: 3.08 s, total: 7.37 s\n", | |
"Wall time: 11.5 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# GPU\n", | |
"train_final_gd = etl_gpu(train_gd,train_meta_gd)\n", | |
"train_final_gd = train_final_gd.merge(train_flux_skew_gd,on=['object_id'],how='left')\n", | |
"test_final_gd = etl_gpu(test_gd,test_meta_gd)\n", | |
"del test_gd,test_meta_gd\n", | |
"test_final_gd = test_final_gd.merge(test_flux_skew_gd,on=['object_id'],how='left')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 3min 56s, sys: 1min 42s, total: 5min 39s\n", | |
"Wall time: 1min 57s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"#CPU\n", | |
"train_final = etl_cpu(train,train_meta)\n", | |
"train_final = train_final.merge(train_flux_skew,on=['object_id'],how='left')\n", | |
"test_final = etl_cpu(test,test_meta)\n", | |
"test_final = test_final.merge(test_flux_skew,on=['object_id'],how='left')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"we achieve 10.174 speedup!\n" | |
] | |
} | |
], | |
"source": [ | |
"speedup = (1*60 + 57)/11.5\n", | |
"print(\"we achieve %.3f speedup!\"%speedup)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# train and validation with 5x speedup" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[ 6 15 16 42 52 53 62 64 65 67 88 90 92 95]\n" | |
] | |
} | |
], | |
"source": [ | |
"# CPU\n", | |
"X = train_final.drop(['object_id','target'],axis=1).values\n", | |
"y = train_final['target']\n", | |
"Xt = test_final.drop(['object_id'],axis=1).values\n", | |
"assert X.shape[1] == Xt.shape[1]\n", | |
"classes = sorted(y.unique()) \n", | |
"# Taken from Giba's topic : https://www.kaggle.com/titericz\n", | |
"# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194\n", | |
"# with Kyle Boone's post https://www.kaggle.com/kyleboone\n", | |
"class_weights = {c: 1 for c in classes}\n", | |
"class_weights.update({c:2 for c in [64, 15]})\n", | |
"\n", | |
"lbl = LabelEncoder()\n", | |
"y = lbl.fit_transform(y)\n", | |
"print(lbl.classes_)\n", | |
"\n", | |
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,stratify=y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cpu_params = {\n", | |
" 'objective': 'multi:softprob', \n", | |
" 'tree_method': 'hist', \n", | |
" 'nthread': 16, \n", | |
" 'num_class':14,\n", | |
" 'max_depth': 7, \n", | |
" 'silent':1,\n", | |
" 'subsample':0.7,\n", | |
" 'colsample_bytree': 0.7,}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"func_loss = partial(xgb_multi_weighted_logloss, \n", | |
" classes=classes, \n", | |
" class_weights=class_weights)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[02:47:23] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.\n", | |
"[0]\teval-merror:0.326115\ttrain-merror:0.272122\teval-wloss:2.03443\ttrain-wloss:1.88621\n", | |
"Multiple eval metrics have been passed: 'train-wloss' will be used for early stopping.\n", | |
"\n", | |
"Will train until train-wloss hasn't improved in 10 rounds.\n", | |
"[59]\teval-merror:0.276433\ttrain-merror:0.001982\teval-wloss:1.33349\ttrain-wloss:0.091674\n", | |
"validation loss 1.3335\n", | |
"CPU times: user 6min 38s, sys: 1.43 s, total: 6min 40s\n", | |
"Wall time: 26.5 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"dtrain = xgb.DMatrix(data=X_train, label=y_train)\n", | |
"dvalid = xgb.DMatrix(data=X_test, label=y_test)\n", | |
"dtest = xgb.DMatrix(data=Xt)\n", | |
"watchlist = [(dvalid, 'eval'), (dtrain, 'train')]\n", | |
"clf = xgb.train(cpu_params, dtrain=dtrain,\n", | |
" num_boost_round=60,evals=watchlist,\n", | |
" feval=func_loss,early_stopping_rounds=10,\n", | |
" verbose_eval=1000)\n", | |
"yp = clf.predict(dvalid)\n", | |
"loss = multi_weighted_logloss(y_test, yp, classes, class_weights)\n", | |
"ysub = clf.predict(dtest)\n", | |
"print('validation loss %.4f'%loss)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# GPU\n", | |
"y = train_final_gd['target'].to_array()\n", | |
"y = lbl.fit_transform(y)\n", | |
"for col in ['object_id','target']:\n", | |
" train_final_gd.drop_column(col)\n", | |
"for col in train_final_gd.columns:\n", | |
" train_final_gd[col] = train_final_gd[col].fillna(0).astype('float32')\n", | |
"\n", | |
"\n", | |
"for col in ['object_id']:\n", | |
" test_final_gd.drop_column(col)\n", | |
"for col in test_final_gd.columns:\n", | |
" test_final_gd[col] = test_final_gd[col].fillna(0).astype('float32')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cols = [i for i in test_final_gd.columns]\n", | |
"X = train_final_gd[cols].as_matrix()\n", | |
"Xt = test_final_gd.as_matrix()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,stratify=y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# GPU\n", | |
"gpu_params = cpu_params.copy()\n", | |
"gpu_params.update({'objective': 'multi:softprob',\n", | |
" 'tree_method': 'gpu_hist', \n", | |
" })" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[0]\teval-merror:0.338854\ttrain-merror:0.296899\teval-wloss:1.98145\ttrain-wloss:1.8993\n", | |
"Multiple eval metrics have been passed: 'train-wloss' will be used for early stopping.\n", | |
"\n", | |
"Will train until train-wloss hasn't improved in 10 rounds.\n", | |
"[59]\teval-merror:0.26242\ttrain-merror:0.002832\teval-wloss:1.13939\ttrain-wloss:0.102957\n", | |
"validation loss 1.1394\n", | |
"CPU times: user 1min 16s, sys: 1.12 s, total: 1min 17s\n", | |
"Wall time: 5.68 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"dtrain = xgb.DMatrix(data=X_train, label=y_train)\n", | |
"dvalid = xgb.DMatrix(data=X_test, label=y_test)\n", | |
"dtest = xgb.DMatrix(data=Xt)\n", | |
"watchlist = [(dvalid, 'eval'), (dtrain, 'train')]\n", | |
"clf = xgb.train(gpu_params, dtrain=dtrain,\n", | |
" num_boost_round=60,evals=watchlist,\n", | |
" feval=func_loss,early_stopping_rounds=10,\n", | |
" verbose_eval=1000)\n", | |
"yp = clf.predict(dvalid)\n", | |
"loss = multi_weighted_logloss(y_test, yp, classes, class_weights)\n", | |
"ysub = clf.predict(dtest)\n", | |
"print('validation loss %.4f'%loss)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"we achieve 4.665 speedup!\n" | |
] | |
} | |
], | |
"source": [ | |
"speedup = 26.5/5.68\n", | |
"print(\"we achieve %.3f speedup!\"%speedup)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"we achieve 24.594 speedup in total!\n" | |
] | |
} | |
], | |
"source": [ | |
"total_speedup = (10*60 + 32 + 3*60 + 45 + 1*60 + 57 + 26.5)/(4.8+18.7+11.5+5.68)\n", | |
"print(\"we achieve %.3f speedup in total!\"%total_speedup)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"40.68" | |
] | |
}, | |
"execution_count": 46, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"4.8+18.7+11.5+5.68" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment