-
-
Save daxiongshu/5b83d85b913d167a7cba3ffecd2a6dd0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "54bcbfa3-0eec-4fcd-81d6-6260500c917f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "c06cfa80-14c8-4b30-a463-ec43de8f5e1b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"('23.04.00', '1.7.1')" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import cudf\n", | |
"import cupy\n", | |
"from tqdm import tqdm\n", | |
"import numpy as np\n", | |
"import gc\n", | |
"import xgboost as xgb\n", | |
"from utils import amex_metric_np\n", | |
"from pathlib import Path\n", | |
"\n", | |
"cudf.__version__, xgb.__version__" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "9c237176-3f3b-4943-ab22-b1bae1e72942", | |
"metadata": {}, | |
"source": [ | |
"# Register a Kaggle account and install the Kaggle API:\n", | |
"- `pip install kaggle`\n", | |
"- complete [authentication](https://www.kaggle.com/docs/api)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "e88971c2-5409-463d-91d6-cf4e42d0d881", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"PATH = '/raid/data/ml/kaggle/amex'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "fea38251-7b2b-48b6-a8e7-0d5fc4030c4d", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"Path(PATH).mkdir(parents=True,exist_ok=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "b65adb64-7f63-4418-80a1-40fb226327e9", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"cmd = f'kaggle datasets download -d raddar/amex-data-integer-dtypes-parquet-format -p {PATH}'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "375f1c13-dccb-49f8-ad45-af6ccf60ab30", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Downloading amex-data-integer-dtypes-parquet-format.zip to /raid/data/ml/kaggle/amex\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|█████████▉| 4.06G/4.07G [01:13<00:00, 63.0MB/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 4.07G/4.07G [01:13<00:00, 59.3MB/s]\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"os.system(cmd)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "8a7e4d9e-9059-4e6a-ad4a-a1630783fc3b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['amex-data-integer-dtypes-parquet-format.zip']" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"os.listdir(PATH)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "88d87bee-0287-44b9-aa3e-b033cb72ea57", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Archive: amex-data-integer-dtypes-parquet-format.zip\n", | |
" inflating: test.parquet \n", | |
" inflating: train.parquet \n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"cmd = f'cd {PATH} && unzip amex-data-integer-dtypes-parquet-format.zip'\n", | |
"os.system(cmd)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "0216a228-c355-48f2-b038-caa9412fbaa8", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['amex-data-integer-dtypes-parquet-format.zip',\n", | |
" 'train.parquet',\n", | |
" 'test.parquet']" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"os.listdir(PATH)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f5a4489e-08d9-410a-b7b4-eec77ca1da73", | |
"metadata": {}, | |
"source": [ | |
"# Basic EDA" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "e3fb4208-5541-430f-be79-99915536c786", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(5531451, 190)\n", | |
"CPU times: user 1.26 s, sys: 1.31 s, total: 2.57 s\n", | |
"Wall time: 2.6 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>customer_ID</th>\n", | |
" <th>S_2</th>\n", | |
" <th>P_2</th>\n", | |
" <th>D_39</th>\n", | |
" <th>B_1</th>\n", | |
" <th>B_2</th>\n", | |
" <th>R_1</th>\n", | |
" <th>S_3</th>\n", | |
" <th>D_41</th>\n", | |
" <th>B_3</th>\n", | |
" <th>...</th>\n", | |
" <th>D_136</th>\n", | |
" <th>D_137</th>\n", | |
" <th>D_138</th>\n", | |
" <th>D_139</th>\n", | |
" <th>D_140</th>\n", | |
" <th>D_141</th>\n", | |
" <th>D_142</th>\n", | |
" <th>D_143</th>\n", | |
" <th>D_144</th>\n", | |
" <th>D_145</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>2017-03-09</td>\n", | |
" <td>0.938469</td>\n", | |
" <td>0</td>\n", | |
" <td>0.008724</td>\n", | |
" <td>1.006838</td>\n", | |
" <td>0.009228</td>\n", | |
" <td>0.124035</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.004709</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.000610</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>2017-04-07</td>\n", | |
" <td>0.936665</td>\n", | |
" <td>0</td>\n", | |
" <td>0.004923</td>\n", | |
" <td>1.000653</td>\n", | |
" <td>0.006151</td>\n", | |
" <td>0.126750</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.002714</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.005492</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>2017-05-28</td>\n", | |
" <td>0.954180</td>\n", | |
" <td>3</td>\n", | |
" <td>0.021655</td>\n", | |
" <td>1.009672</td>\n", | |
" <td>0.006815</td>\n", | |
" <td>0.123977</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.009423</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.006986</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>2017-06-13</td>\n", | |
" <td>0.960384</td>\n", | |
" <td>0</td>\n", | |
" <td>0.013683</td>\n", | |
" <td>1.002700</td>\n", | |
" <td>0.001373</td>\n", | |
" <td>0.117169</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.005531</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.006527</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>2017-07-16</td>\n", | |
" <td>0.947248</td>\n", | |
" <td>0</td>\n", | |
" <td>0.015193</td>\n", | |
" <td>1.000727</td>\n", | |
" <td>0.007605</td>\n", | |
" <td>0.117325</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.009312</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.008126</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 190 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" customer_ID S_2 P_2 \\\n", | |
"0 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-03-09 0.938469 \n", | |
"1 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-04-07 0.936665 \n", | |
"2 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-05-28 0.954180 \n", | |
"3 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-06-13 0.960384 \n", | |
"4 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-07-16 0.947248 \n", | |
"\n", | |
" D_39 B_1 B_2 R_1 S_3 D_41 B_3 ... D_136 \\\n", | |
"0 0 0.008724 1.006838 0.009228 0.124035 0.0 0.004709 ... -1 \n", | |
"1 0 0.004923 1.000653 0.006151 0.126750 0.0 0.002714 ... -1 \n", | |
"2 3 0.021655 1.009672 0.006815 0.123977 0.0 0.009423 ... -1 \n", | |
"3 0 0.013683 1.002700 0.001373 0.117169 0.0 0.005531 ... -1 \n", | |
"4 0 0.015193 1.000727 0.007605 0.117325 0.0 0.009312 ... -1 \n", | |
"\n", | |
" D_137 D_138 D_139 D_140 D_141 D_142 D_143 D_144 D_145 \n", | |
"0 -1 -1 0 0 0.0 <NA> 0 0.000610 0 \n", | |
"1 -1 -1 0 0 0.0 <NA> 0 0.005492 0 \n", | |
"2 -1 -1 0 0 0.0 <NA> 0 0.006986 0 \n", | |
"3 -1 -1 0 0 0.0 <NA> 0 0.006527 0 \n", | |
"4 -1 -1 0 0 0.0 <NA> 0 0.008126 0 \n", | |
"\n", | |
"[5 rows x 190 columns]" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"train = cudf.read_parquet(f'{PATH}/train.parquet')\n", | |
"print(train.shape)\n", | |
"train.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "2727b9b8-59e8-40d9-90ee-4889bf35a52b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 5.63 ms, sys: 9.56 ms, total: 15.2 ms\n", | |
"Wall time: 14.1 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>num_profiles</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>customer_ID</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>c761f5f5b15e563daa67f0a41c3ec2a870d3c9daaadf0cd11dd808d3aaa82c46</th>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>e16b5594d9dce9ebd2f8e0d7074391736b2641afa9e349f67a53f7cc780c120b</th>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8c846c26e1f1d4afa04977155c41bc3b6bb77c72efc5db3f592ec3d72f12cfdc</th>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>463e8a9b5b0161764bbbb0b5b58956bb8ebff6244219b21ac257a07364fa8dd9</th>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>92bbe3e2a159bcc838b86241471eb14153c8d712b6647feffbe49d5266cdfd3f</th>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" num_profiles\n", | |
"customer_ID \n", | |
"c761f5f5b15e563daa67f0a41c3ec2a870d3c9daaadf0cd... 13\n", | |
"e16b5594d9dce9ebd2f8e0d7074391736b2641afa9e349f... 13\n", | |
"8c846c26e1f1d4afa04977155c41bc3b6bb77c72efc5db3... 13\n", | |
"463e8a9b5b0161764bbbb0b5b58956bb8ebff6244219b21... 13\n", | |
"92bbe3e2a159bcc838b86241471eb14153c8d712b6647fe... 13" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"count_df = train.groupby('customer_ID').size().to_frame('num_profiles')\n", | |
"count_df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "4da4f08f-8727-45a8-bb88-2db32fa4220a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"13" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"count_df.num_profiles.max()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "13e12b4c-0af3-4b73-ac21-d78ac271d36a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 12.6 ms, sys: 4.63 ms, total: 17.2 ms\n", | |
"Wall time: 15.7 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>customer_ID</th>\n", | |
" <th>S_2</th>\n", | |
" <th>P_2</th>\n", | |
" <th>D_39</th>\n", | |
" <th>B_1</th>\n", | |
" <th>B_2</th>\n", | |
" <th>R_1</th>\n", | |
" <th>S_3</th>\n", | |
" <th>D_41</th>\n", | |
" <th>B_3</th>\n", | |
" <th>...</th>\n", | |
" <th>D_136</th>\n", | |
" <th>D_137</th>\n", | |
" <th>D_138</th>\n", | |
" <th>D_139</th>\n", | |
" <th>D_140</th>\n", | |
" <th>D_141</th>\n", | |
" <th>D_142</th>\n", | |
" <th>D_143</th>\n", | |
" <th>D_144</th>\n", | |
" <th>D_145</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>2017-03-09</td>\n", | |
" <td>0.938469</td>\n", | |
" <td>0</td>\n", | |
" <td>0.008724</td>\n", | |
" <td>1.006838</td>\n", | |
" <td>0.009228</td>\n", | |
" <td>0.124035</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.004709</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.000610</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>2017-04-07</td>\n", | |
" <td>0.936665</td>\n", | |
" <td>0</td>\n", | |
" <td>0.004923</td>\n", | |
" <td>1.000653</td>\n", | |
" <td>0.006151</td>\n", | |
" <td>0.126750</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.002714</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.005492</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>2017-05-28</td>\n", | |
" <td>0.954180</td>\n", | |
" <td>3</td>\n", | |
" <td>0.021655</td>\n", | |
" <td>1.009672</td>\n", | |
" <td>0.006815</td>\n", | |
" <td>0.123977</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.009423</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.006986</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>2017-06-13</td>\n", | |
" <td>0.960384</td>\n", | |
" <td>0</td>\n", | |
" <td>0.013683</td>\n", | |
" <td>1.002700</td>\n", | |
" <td>0.001373</td>\n", | |
" <td>0.117169</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.005531</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.006527</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>2017-07-16</td>\n", | |
" <td>0.947248</td>\n", | |
" <td>0</td>\n", | |
" <td>0.015193</td>\n", | |
" <td>1.000727</td>\n", | |
" <td>0.007605</td>\n", | |
" <td>0.117325</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.009312</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.008126</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 190 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" customer_ID S_2 P_2 \\\n", | |
"0 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-03-09 0.938469 \n", | |
"1 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-04-07 0.936665 \n", | |
"2 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-05-28 0.954180 \n", | |
"3 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-06-13 0.960384 \n", | |
"4 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-07-16 0.947248 \n", | |
"\n", | |
" D_39 B_1 B_2 R_1 S_3 D_41 B_3 ... D_136 \\\n", | |
"0 0 0.008724 1.006838 0.009228 0.124035 0.0 0.004709 ... -1 \n", | |
"1 0 0.004923 1.000653 0.006151 0.126750 0.0 0.002714 ... -1 \n", | |
"2 3 0.021655 1.009672 0.006815 0.123977 0.0 0.009423 ... -1 \n", | |
"3 0 0.013683 1.002700 0.001373 0.117169 0.0 0.005531 ... -1 \n", | |
"4 0 0.015193 1.000727 0.007605 0.117325 0.0 0.009312 ... -1 \n", | |
"\n", | |
" D_137 D_138 D_139 D_140 D_141 D_142 D_143 D_144 D_145 \n", | |
"0 -1 -1 0 0 0.0 <NA> 0 0.000610 0 \n", | |
"1 -1 -1 0 0 0.0 <NA> 0 0.005492 0 \n", | |
"2 -1 -1 0 0 0.0 <NA> 0 0.006986 0 \n", | |
"3 -1 -1 0 0 0.0 <NA> 0 0.006527 0 \n", | |
"4 -1 -1 0 0 0.0 <NA> 0 0.008126 0 \n", | |
"\n", | |
"[5 rows x 190 columns]" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"train['S_2'] = cudf.to_datetime(train['S_2'])\n", | |
"train.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "8f6fd282-8784-427b-9e5a-c341a13d91cd", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(numpy.datetime64('2017-03-01T00:00:00.000000000'),\n", | |
" numpy.datetime64('2018-03-31T00:00:00.000000000'))" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"train.S_2.min(), train.S_2.max()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6aefdd16-d078-4df4-9184-087bac6d22f9", | |
"metadata": {}, | |
"source": [ | |
"## Download the training data labels" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "4e366bc8-657a-4e4e-a4fa-cd65da90e2ae", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Downloading train_labels.csv.zip to /raid/data/ml/kaggle/amex\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
" 31%|███ | 5.00M/16.2M [00:00<00:00, 29.6MB/s]" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 16.2M/16.2M [00:00<00:00, 21.8MB/s]\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"cmd = f'kaggle competitions download -c amex-default-prediction -f train_labels.csv -p {PATH}/'\n", | |
"os.system(cmd)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "0c7fcdcd-936a-4580-9127-60fb130535f2", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Archive: train_labels.csv.zip\n", | |
" inflating: train_labels.csv \n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"0" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"cmd = f'cd {PATH} && unzip train_labels.csv.zip'\n", | |
"os.system(cmd)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "7ec73ed7-b457-49af-913a-c79e7f425f38", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(458913, 2)\n", | |
"CPU times: user 19.3 ms, sys: 11.2 ms, total: 30.5 ms\n", | |
"Wall time: 34.7 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>customer_ID</th>\n", | |
" <th>target</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>00000fd6641609c6ece5454664794f0340ad84dddce9a2...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>00001b22f846c82c51f6e3958ccd81970162bae8b007e8...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>000041bdba6ecadd89a52d11886e8eaaec9325906c9723...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" customer_ID target\n", | |
"0 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 0\n", | |
"1 00000fd6641609c6ece5454664794f0340ad84dddce9a2... 0\n", | |
"2 00001b22f846c82c51f6e3958ccd81970162bae8b007e8... 0\n", | |
"3 000041bdba6ecadd89a52d11886e8eaaec9325906c9723... 0\n", | |
"4 00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a... 0" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"trainl = cudf.read_csv(f'{PATH}/train_labels.csv')\n", | |
"print(trainl.shape)\n", | |
"trainl.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "a586ed4d-f157-45c1-8a8f-e0b623aae80f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 340085\n", | |
"1 118828\n", | |
"Name: target, dtype: int32" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"trainl['target'].value_counts()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "1a695ca4-ba9f-448c-b338-7b13733c5ec6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(5531451, 191)\n", | |
"CPU times: user 49 ms, sys: 157 ms, total: 206 ms\n", | |
"Wall time: 205 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>customer_ID</th>\n", | |
" <th>S_2</th>\n", | |
" <th>P_2</th>\n", | |
" <th>D_39</th>\n", | |
" <th>B_1</th>\n", | |
" <th>B_2</th>\n", | |
" <th>R_1</th>\n", | |
" <th>S_3</th>\n", | |
" <th>D_41</th>\n", | |
" <th>B_3</th>\n", | |
" <th>...</th>\n", | |
" <th>D_137</th>\n", | |
" <th>D_138</th>\n", | |
" <th>D_139</th>\n", | |
" <th>D_140</th>\n", | |
" <th>D_141</th>\n", | |
" <th>D_142</th>\n", | |
" <th>D_143</th>\n", | |
" <th>D_144</th>\n", | |
" <th>D_145</th>\n", | |
" <th>target</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0003b7891c49786441d138c01f55f1712df645989dcd2a...</td>\n", | |
" <td>2017-08-20</td>\n", | |
" <td>0.951668</td>\n", | |
" <td>0</td>\n", | |
" <td>0.021847</td>\n", | |
" <td>0.705988</td>\n", | |
" <td>0.001923</td>\n", | |
" <td>0.544475</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.040750</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.002867</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>0003b7891c49786441d138c01f55f1712df645989dcd2a...</td>\n", | |
" <td>2017-09-27</td>\n", | |
" <td>0.949354</td>\n", | |
" <td>0</td>\n", | |
" <td>0.021467</td>\n", | |
" <td>0.519889</td>\n", | |
" <td>0.002993</td>\n", | |
" <td>0.505413</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.045911</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.004061</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0003b7891c49786441d138c01f55f1712df645989dcd2a...</td>\n", | |
" <td>2017-10-22</td>\n", | |
" <td>0.928322</td>\n", | |
" <td>0</td>\n", | |
" <td>0.017777</td>\n", | |
" <td>1.002783</td>\n", | |
" <td>0.007028</td>\n", | |
" <td>0.499947</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.027716</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.005397</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>0003b7891c49786441d138c01f55f1712df645989dcd2a...</td>\n", | |
" <td>2017-11-01</td>\n", | |
" <td>0.929362</td>\n", | |
" <td>3</td>\n", | |
" <td>0.020861</td>\n", | |
" <td>0.811287</td>\n", | |
" <td>0.005151</td>\n", | |
" <td>0.505690</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.023649</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.002780</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>0003b7891c49786441d138c01f55f1712df645989dcd2a...</td>\n", | |
" <td>2017-12-04</td>\n", | |
" <td>0.948366</td>\n", | |
" <td>6</td>\n", | |
" <td>0.037010</td>\n", | |
" <td>1.004685</td>\n", | |
" <td>0.006001</td>\n", | |
" <td>0.549692</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.019136</td>\n", | |
" <td>...</td>\n", | |
" <td>-1</td>\n", | |
" <td>-1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0.0</td>\n", | |
" <td><NA></td>\n", | |
" <td>0</td>\n", | |
" <td>0.005114</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 191 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" customer_ID S_2 P_2 \\\n", | |
"0 0003b7891c49786441d138c01f55f1712df645989dcd2a... 2017-08-20 0.951668 \n", | |
"1 0003b7891c49786441d138c01f55f1712df645989dcd2a... 2017-09-27 0.949354 \n", | |
"2 0003b7891c49786441d138c01f55f1712df645989dcd2a... 2017-10-22 0.928322 \n", | |
"3 0003b7891c49786441d138c01f55f1712df645989dcd2a... 2017-11-01 0.929362 \n", | |
"4 0003b7891c49786441d138c01f55f1712df645989dcd2a... 2017-12-04 0.948366 \n", | |
"\n", | |
" D_39 B_1 B_2 R_1 S_3 D_41 B_3 ... D_137 \\\n", | |
"0 0 0.021847 0.705988 0.001923 0.544475 0.0 0.040750 ... -1 \n", | |
"1 0 0.021467 0.519889 0.002993 0.505413 0.0 0.045911 ... -1 \n", | |
"2 0 0.017777 1.002783 0.007028 0.499947 0.0 0.027716 ... -1 \n", | |
"3 3 0.020861 0.811287 0.005151 0.505690 0.0 0.023649 ... -1 \n", | |
"4 6 0.037010 1.004685 0.006001 0.549692 0.0 0.019136 ... -1 \n", | |
"\n", | |
" D_138 D_139 D_140 D_141 D_142 D_143 D_144 D_145 target \n", | |
"0 -1 0 0 0.0 <NA> 0 0.002867 0 0 \n", | |
"1 -1 0 0 0.0 <NA> 0 0.004061 0 0 \n", | |
"2 -1 0 0 0.0 <NA> 0 0.005397 0 0 \n", | |
"3 -1 0 0 0.0 <NA> 0 0.002780 0 0 \n", | |
"4 -1 0 0 0.0 <NA> 0 0.005114 0 0 \n", | |
"\n", | |
"[5 rows x 191 columns]" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"train = train.merge(trainl, on='customer_ID', how='left')\n", | |
"print(train.shape)\n", | |
"train.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "a9752ac0-e607-41ab-85bc-13f1976e964f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"train['cid'], _ = train.customer_ID.factorize()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "469063ce-d3f0-48ee-944b-dc048f821dd1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Verify target distribution is consistent across tr and va\n", | |
"0.2493462806763533 0.24835027876095958\n" | |
] | |
} | |
], | |
"source": [ | |
"mask = train['cid']%4 == 0\n", | |
"tr,va = train.loc[~mask],train.loc[mask]\n", | |
"print(\"Verify target distribution is consistent across tr and va\")\n", | |
"print(tr['target'].mean(), va['target'].mean())" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.10" | |
}, | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"state": {}, | |
"version_major": 2, | |
"version_minor": 0 | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment