Skip to content

Instantly share code, notes, and snippets.

@daxiongshu
Created April 4, 2023 13:55
Show Gist options
  • Save daxiongshu/5b83d85b913d167a7cba3ffecd2a6dd0 to your computer and use it in GitHub Desktop.
Save daxiongshu/5b83d85b913d167a7cba3ffecd2a6dd0 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "54bcbfa3-0eec-4fcd-81d6-6260500c917f",
"metadata": {},
"outputs": [],
"source": [
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c06cfa80-14c8-4b30-a463-ec43de8f5e1b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('23.04.00', '1.7.1')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import cudf\n",
"import cupy\n",
"from tqdm import tqdm\n",
"import numpy as np\n",
"import gc\n",
"import xgboost as xgb\n",
"from utils import amex_metric_np\n",
"from pathlib import Path\n",
"\n",
"cudf.__version__, xgb.__version__"
]
},
{
"cell_type": "markdown",
"id": "9c237176-3f3b-4943-ab22-b1bae1e72942",
"metadata": {},
"source": [
"# Prerequisites: register a Kaggle account and set up the Kaggle API\n",
"- install the client: `pip install kaggle`\n",
"- complete [authentication](https://www.kaggle.com/docs/api)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e88971c2-5409-463d-91d6-cf4e42d0d881",
"metadata": {},
"outputs": [],
"source": [
"# Directory for the downloaded data; set AMEX_DATA_DIR to override the default.\n",
"# The value is pasted unquoted into the shell commands below, so avoid spaces.\n",
"PATH = os.environ.get('AMEX_DATA_DIR', '/raid/data/ml/kaggle/amex')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "fea38251-7b2b-48b6-a8e7-0d5fc4030c4d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Create the data directory (and any missing parents); no error if it already exists.\n",
"os.makedirs(PATH, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b65adb64-7f63-4418-80a1-40fb226327e9",
"metadata": {},
"outputs": [],
"source": [
"cmd = f'kaggle datasets download -d raddar/amex-data-integer-dtypes-parquet-format -p {PATH}'"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "375f1c13-dccb-49f8-ad45-af6ccf60ab30",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading amex-data-integer-dtypes-parquet-format.zip to /raid/data/ml/kaggle/amex\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|█████████▉| 4.06G/4.07G [01:13<00:00, 63.0MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 4.07G/4.07G [01:13<00:00, 59.3MB/s]\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A non-zero return value from os.system means the command failed; stop here\n",
"# instead of letting later cells fail on missing files.\n",
"status = os.system(cmd)\n",
"if status != 0:\n",
"    raise RuntimeError(f'command failed with exit status {status}: {cmd}')\n",
"status"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8a7e4d9e-9059-4e6a-ad4a-a1630783fc3b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['amex-data-integer-dtypes-parquet-format.zip']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.listdir(PATH)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "88d87bee-0287-44b9-aa3e-b033cb72ea57",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: amex-data-integer-dtypes-parquet-format.zip\n",
" inflating: test.parquet \n",
" inflating: train.parquet \n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# -o overwrites existing files without prompting, so this cell can be re-run.\n",
"cmd = f'cd {PATH} && unzip -o amex-data-integer-dtypes-parquet-format.zip'\n",
"status = os.system(cmd)\n",
"if status != 0:\n",
"    raise RuntimeError(f'command failed with exit status {status}: {cmd}')\n",
"status"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "0216a228-c355-48f2-b038-caa9412fbaa8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['amex-data-integer-dtypes-parquet-format.zip',\n",
" 'train.parquet',\n",
" 'test.parquet']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.listdir(PATH)"
]
},
{
"cell_type": "markdown",
"id": "f5a4489e-08d9-410a-b7b4-eec77ca1da73",
"metadata": {},
"source": [
"# Basic EDA"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e3fb4208-5541-430f-be79-99915536c786",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(5531451, 190)\n",
"CPU times: user 1.26 s, sys: 1.31 s, total: 2.57 s\n",
"Wall time: 2.6 s\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_ID</th>\n",
" <th>S_2</th>\n",
" <th>P_2</th>\n",
" <th>D_39</th>\n",
" <th>B_1</th>\n",
" <th>B_2</th>\n",
" <th>R_1</th>\n",
" <th>S_3</th>\n",
" <th>D_41</th>\n",
" <th>B_3</th>\n",
" <th>...</th>\n",
" <th>D_136</th>\n",
" <th>D_137</th>\n",
" <th>D_138</th>\n",
" <th>D_139</th>\n",
" <th>D_140</th>\n",
" <th>D_141</th>\n",
" <th>D_142</th>\n",
" <th>D_143</th>\n",
" <th>D_144</th>\n",
" <th>D_145</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>2017-03-09</td>\n",
" <td>0.938469</td>\n",
" <td>0</td>\n",
" <td>0.008724</td>\n",
" <td>1.006838</td>\n",
" <td>0.009228</td>\n",
" <td>0.124035</td>\n",
" <td>0.0</td>\n",
" <td>0.004709</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.000610</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>2017-04-07</td>\n",
" <td>0.936665</td>\n",
" <td>0</td>\n",
" <td>0.004923</td>\n",
" <td>1.000653</td>\n",
" <td>0.006151</td>\n",
" <td>0.126750</td>\n",
" <td>0.0</td>\n",
" <td>0.002714</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.005492</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>2017-05-28</td>\n",
" <td>0.954180</td>\n",
" <td>3</td>\n",
" <td>0.021655</td>\n",
" <td>1.009672</td>\n",
" <td>0.006815</td>\n",
" <td>0.123977</td>\n",
" <td>0.0</td>\n",
" <td>0.009423</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.006986</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>2017-06-13</td>\n",
" <td>0.960384</td>\n",
" <td>0</td>\n",
" <td>0.013683</td>\n",
" <td>1.002700</td>\n",
" <td>0.001373</td>\n",
" <td>0.117169</td>\n",
" <td>0.0</td>\n",
" <td>0.005531</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.006527</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>2017-07-16</td>\n",
" <td>0.947248</td>\n",
" <td>0</td>\n",
" <td>0.015193</td>\n",
" <td>1.000727</td>\n",
" <td>0.007605</td>\n",
" <td>0.117325</td>\n",
" <td>0.0</td>\n",
" <td>0.009312</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.008126</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 190 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_ID S_2 P_2 \\\n",
"0 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-03-09 0.938469 \n",
"1 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-04-07 0.936665 \n",
"2 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-05-28 0.954180 \n",
"3 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-06-13 0.960384 \n",
"4 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-07-16 0.947248 \n",
"\n",
" D_39 B_1 B_2 R_1 S_3 D_41 B_3 ... D_136 \\\n",
"0 0 0.008724 1.006838 0.009228 0.124035 0.0 0.004709 ... -1 \n",
"1 0 0.004923 1.000653 0.006151 0.126750 0.0 0.002714 ... -1 \n",
"2 3 0.021655 1.009672 0.006815 0.123977 0.0 0.009423 ... -1 \n",
"3 0 0.013683 1.002700 0.001373 0.117169 0.0 0.005531 ... -1 \n",
"4 0 0.015193 1.000727 0.007605 0.117325 0.0 0.009312 ... -1 \n",
"\n",
" D_137 D_138 D_139 D_140 D_141 D_142 D_143 D_144 D_145 \n",
"0 -1 -1 0 0 0.0 <NA> 0 0.000610 0 \n",
"1 -1 -1 0 0 0.0 <NA> 0 0.005492 0 \n",
"2 -1 -1 0 0 0.0 <NA> 0 0.006986 0 \n",
"3 -1 -1 0 0 0.0 <NA> 0 0.006527 0 \n",
"4 -1 -1 0 0 0.0 <NA> 0 0.008126 0 \n",
"\n",
"[5 rows x 190 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"train = cudf.read_parquet(f'{PATH}/train.parquet')\n",
"print(train.shape)\n",
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "2727b9b8-59e8-40d9-90ee-4889bf35a52b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.63 ms, sys: 9.56 ms, total: 15.2 ms\n",
"Wall time: 14.1 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>num_profiles</th>\n",
" </tr>\n",
" <tr>\n",
" <th>customer_ID</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>c761f5f5b15e563daa67f0a41c3ec2a870d3c9daaadf0cd11dd808d3aaa82c46</th>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>e16b5594d9dce9ebd2f8e0d7074391736b2641afa9e349f67a53f7cc780c120b</th>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8c846c26e1f1d4afa04977155c41bc3b6bb77c72efc5db3f592ec3d72f12cfdc</th>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>463e8a9b5b0161764bbbb0b5b58956bb8ebff6244219b21ac257a07364fa8dd9</th>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92bbe3e2a159bcc838b86241471eb14153c8d712b6647feffbe49d5266cdfd3f</th>\n",
" <td>13</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" num_profiles\n",
"customer_ID \n",
"c761f5f5b15e563daa67f0a41c3ec2a870d3c9daaadf0cd... 13\n",
"e16b5594d9dce9ebd2f8e0d7074391736b2641afa9e349f... 13\n",
"8c846c26e1f1d4afa04977155c41bc3b6bb77c72efc5db3... 13\n",
"463e8a9b5b0161764bbbb0b5b58956bb8ebff6244219b21... 13\n",
"92bbe3e2a159bcc838b86241471eb14153c8d712b6647fe... 13"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Number of rows per customer.\n",
"count_df = (\n",
"    train.groupby('customer_ID')\n",
"    .size()\n",
"    .to_frame('num_profiles')\n",
")\n",
"count_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "4da4f08f-8727-45a8-bb88-2db32fa4220a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count_df.num_profiles.max()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "13e12b4c-0af3-4b73-ac21-d78ac271d36a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 12.6 ms, sys: 4.63 ms, total: 17.2 ms\n",
"Wall time: 15.7 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_ID</th>\n",
" <th>S_2</th>\n",
" <th>P_2</th>\n",
" <th>D_39</th>\n",
" <th>B_1</th>\n",
" <th>B_2</th>\n",
" <th>R_1</th>\n",
" <th>S_3</th>\n",
" <th>D_41</th>\n",
" <th>B_3</th>\n",
" <th>...</th>\n",
" <th>D_136</th>\n",
" <th>D_137</th>\n",
" <th>D_138</th>\n",
" <th>D_139</th>\n",
" <th>D_140</th>\n",
" <th>D_141</th>\n",
" <th>D_142</th>\n",
" <th>D_143</th>\n",
" <th>D_144</th>\n",
" <th>D_145</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>2017-03-09</td>\n",
" <td>0.938469</td>\n",
" <td>0</td>\n",
" <td>0.008724</td>\n",
" <td>1.006838</td>\n",
" <td>0.009228</td>\n",
" <td>0.124035</td>\n",
" <td>0.0</td>\n",
" <td>0.004709</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.000610</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>2017-04-07</td>\n",
" <td>0.936665</td>\n",
" <td>0</td>\n",
" <td>0.004923</td>\n",
" <td>1.000653</td>\n",
" <td>0.006151</td>\n",
" <td>0.126750</td>\n",
" <td>0.0</td>\n",
" <td>0.002714</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.005492</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>2017-05-28</td>\n",
" <td>0.954180</td>\n",
" <td>3</td>\n",
" <td>0.021655</td>\n",
" <td>1.009672</td>\n",
" <td>0.006815</td>\n",
" <td>0.123977</td>\n",
" <td>0.0</td>\n",
" <td>0.009423</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.006986</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>2017-06-13</td>\n",
" <td>0.960384</td>\n",
" <td>0</td>\n",
" <td>0.013683</td>\n",
" <td>1.002700</td>\n",
" <td>0.001373</td>\n",
" <td>0.117169</td>\n",
" <td>0.0</td>\n",
" <td>0.005531</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.006527</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>2017-07-16</td>\n",
" <td>0.947248</td>\n",
" <td>0</td>\n",
" <td>0.015193</td>\n",
" <td>1.000727</td>\n",
" <td>0.007605</td>\n",
" <td>0.117325</td>\n",
" <td>0.0</td>\n",
" <td>0.009312</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.008126</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 190 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_ID S_2 P_2 \\\n",
"0 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-03-09 0.938469 \n",
"1 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-04-07 0.936665 \n",
"2 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-05-28 0.954180 \n",
"3 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-06-13 0.960384 \n",
"4 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 2017-07-16 0.947248 \n",
"\n",
" D_39 B_1 B_2 R_1 S_3 D_41 B_3 ... D_136 \\\n",
"0 0 0.008724 1.006838 0.009228 0.124035 0.0 0.004709 ... -1 \n",
"1 0 0.004923 1.000653 0.006151 0.126750 0.0 0.002714 ... -1 \n",
"2 3 0.021655 1.009672 0.006815 0.123977 0.0 0.009423 ... -1 \n",
"3 0 0.013683 1.002700 0.001373 0.117169 0.0 0.005531 ... -1 \n",
"4 0 0.015193 1.000727 0.007605 0.117325 0.0 0.009312 ... -1 \n",
"\n",
" D_137 D_138 D_139 D_140 D_141 D_142 D_143 D_144 D_145 \n",
"0 -1 -1 0 0 0.0 <NA> 0 0.000610 0 \n",
"1 -1 -1 0 0 0.0 <NA> 0 0.005492 0 \n",
"2 -1 -1 0 0 0.0 <NA> 0 0.006986 0 \n",
"3 -1 -1 0 0 0.0 <NA> 0 0.006527 0 \n",
"4 -1 -1 0 0 0.0 <NA> 0 0.008126 0 \n",
"\n",
"[5 rows x 190 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"train['S_2'] = cudf.to_datetime(train['S_2'])\n",
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "8f6fd282-8784-427b-9e5a-c341a13d91cd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(numpy.datetime64('2017-03-01T00:00:00.000000000'),\n",
" numpy.datetime64('2018-03-31T00:00:00.000000000'))"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.S_2.min(), train.S_2.max()"
]
},
{
"cell_type": "markdown",
"id": "6aefdd16-d078-4df4-9184-087bac6d22f9",
"metadata": {},
"source": [
"## Download the training data labels"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "4e366bc8-657a-4e4e-a4fa-cd65da90e2ae",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading train_labels.csv.zip to /raid/data/ml/kaggle/amex\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 31%|███ | 5.00M/16.2M [00:00<00:00, 29.6MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 16.2M/16.2M [00:00<00:00, 21.8MB/s]\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cmd = f'kaggle competitions download -c amex-default-prediction -f train_labels.csv -p {PATH}/'\n",
"# A non-zero return value from os.system means the command failed; stop here\n",
"# instead of letting later cells fail on a missing labels file.\n",
"status = os.system(cmd)\n",
"if status != 0:\n",
"    raise RuntimeError(f'command failed with exit status {status}: {cmd}')\n",
"status"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "0c7fcdcd-936a-4580-9127-60fb130535f2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: train_labels.csv.zip\n",
" inflating: train_labels.csv \n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# -o overwrites existing files without prompting, so this cell can be re-run.\n",
"cmd = f'cd {PATH} && unzip -o train_labels.csv.zip'\n",
"status = os.system(cmd)\n",
"if status != 0:\n",
"    raise RuntimeError(f'command failed with exit status {status}: {cmd}')\n",
"status"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "7ec73ed7-b457-49af-913a-c79e7f425f38",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(458913, 2)\n",
"CPU times: user 19.3 ms, sys: 11.2 ms, total: 30.5 ms\n",
"Wall time: 34.7 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_ID</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>00000fd6641609c6ece5454664794f0340ad84dddce9a2...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>00001b22f846c82c51f6e3958ccd81970162bae8b007e8...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>000041bdba6ecadd89a52d11886e8eaaec9325906c9723...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" customer_ID target\n",
"0 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f... 0\n",
"1 00000fd6641609c6ece5454664794f0340ad84dddce9a2... 0\n",
"2 00001b22f846c82c51f6e3958ccd81970162bae8b007e8... 0\n",
"3 000041bdba6ecadd89a52d11886e8eaaec9325906c9723... 0\n",
"4 00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a... 0"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"trainl = cudf.read_csv(f'{PATH}/train_labels.csv')\n",
"print(trainl.shape)\n",
"trainl.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "a586ed4d-f157-45c1-8a8f-e0b623aae80f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 340085\n",
"1 118828\n",
"Name: target, dtype: int32"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainl['target'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "1a695ca4-ba9f-448c-b338-7b13733c5ec6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(5531451, 191)\n",
"CPU times: user 49 ms, sys: 157 ms, total: 206 ms\n",
"Wall time: 205 ms\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_ID</th>\n",
" <th>S_2</th>\n",
" <th>P_2</th>\n",
" <th>D_39</th>\n",
" <th>B_1</th>\n",
" <th>B_2</th>\n",
" <th>R_1</th>\n",
" <th>S_3</th>\n",
" <th>D_41</th>\n",
" <th>B_3</th>\n",
" <th>...</th>\n",
" <th>D_137</th>\n",
" <th>D_138</th>\n",
" <th>D_139</th>\n",
" <th>D_140</th>\n",
" <th>D_141</th>\n",
" <th>D_142</th>\n",
" <th>D_143</th>\n",
" <th>D_144</th>\n",
" <th>D_145</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0003b7891c49786441d138c01f55f1712df645989dcd2a...</td>\n",
" <td>2017-08-20</td>\n",
" <td>0.951668</td>\n",
" <td>0</td>\n",
" <td>0.021847</td>\n",
" <td>0.705988</td>\n",
" <td>0.001923</td>\n",
" <td>0.544475</td>\n",
" <td>0.0</td>\n",
" <td>0.040750</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.002867</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0003b7891c49786441d138c01f55f1712df645989dcd2a...</td>\n",
" <td>2017-09-27</td>\n",
" <td>0.949354</td>\n",
" <td>0</td>\n",
" <td>0.021467</td>\n",
" <td>0.519889</td>\n",
" <td>0.002993</td>\n",
" <td>0.505413</td>\n",
" <td>0.0</td>\n",
" <td>0.045911</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.004061</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0003b7891c49786441d138c01f55f1712df645989dcd2a...</td>\n",
" <td>2017-10-22</td>\n",
" <td>0.928322</td>\n",
" <td>0</td>\n",
" <td>0.017777</td>\n",
" <td>1.002783</td>\n",
" <td>0.007028</td>\n",
" <td>0.499947</td>\n",
" <td>0.0</td>\n",
" <td>0.027716</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.005397</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0003b7891c49786441d138c01f55f1712df645989dcd2a...</td>\n",
" <td>2017-11-01</td>\n",
" <td>0.929362</td>\n",
" <td>3</td>\n",
" <td>0.020861</td>\n",
" <td>0.811287</td>\n",
" <td>0.005151</td>\n",
" <td>0.505690</td>\n",
" <td>0.0</td>\n",
" <td>0.023649</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.002780</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0003b7891c49786441d138c01f55f1712df645989dcd2a...</td>\n",
" <td>2017-12-04</td>\n",
" <td>0.948366</td>\n",
" <td>6</td>\n",
" <td>0.037010</td>\n",
" <td>1.004685</td>\n",
" <td>0.006001</td>\n",
" <td>0.549692</td>\n",
" <td>0.0</td>\n",
" <td>0.019136</td>\n",
" <td>...</td>\n",
" <td>-1</td>\n",
" <td>-1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>0</td>\n",
" <td>0.005114</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 191 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_ID S_2 P_2 \\\n",
"0 0003b7891c49786441d138c01f55f1712df645989dcd2a... 2017-08-20 0.951668 \n",
"1 0003b7891c49786441d138c01f55f1712df645989dcd2a... 2017-09-27 0.949354 \n",
"2 0003b7891c49786441d138c01f55f1712df645989dcd2a... 2017-10-22 0.928322 \n",
"3 0003b7891c49786441d138c01f55f1712df645989dcd2a... 2017-11-01 0.929362 \n",
"4 0003b7891c49786441d138c01f55f1712df645989dcd2a... 2017-12-04 0.948366 \n",
"\n",
" D_39 B_1 B_2 R_1 S_3 D_41 B_3 ... D_137 \\\n",
"0 0 0.021847 0.705988 0.001923 0.544475 0.0 0.040750 ... -1 \n",
"1 0 0.021467 0.519889 0.002993 0.505413 0.0 0.045911 ... -1 \n",
"2 0 0.017777 1.002783 0.007028 0.499947 0.0 0.027716 ... -1 \n",
"3 3 0.020861 0.811287 0.005151 0.505690 0.0 0.023649 ... -1 \n",
"4 6 0.037010 1.004685 0.006001 0.549692 0.0 0.019136 ... -1 \n",
"\n",
" D_138 D_139 D_140 D_141 D_142 D_143 D_144 D_145 target \n",
"0 -1 0 0 0.0 <NA> 0 0.002867 0 0 \n",
"1 -1 0 0 0.0 <NA> 0 0.004061 0 0 \n",
"2 -1 0 0 0.0 <NA> 0 0.005397 0 0 \n",
"3 -1 0 0 0.0 <NA> 0 0.002780 0 0 \n",
"4 -1 0 0 0.0 <NA> 0 0.005114 0 0 \n",
"\n",
"[5 rows x 191 columns]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Guard the merge so that re-running this cell does not join the labels a second\n",
"# time (which would produce target_x / target_y instead of target).\n",
"if 'target' not in train.columns:\n",
"    train = train.merge(trainl, on='customer_ID', how='left')\n",
"# A left merge leaves a null target for any customer_ID missing from trainl.\n",
"assert not train['target'].isnull().any(), 'customer_ID without a label after merge'\n",
"print(train.shape)\n",
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "a9752ac0-e607-41ab-85bc-13f1976e964f",
"metadata": {},
"outputs": [],
"source": [
"# Integer code per customer. Index the returned (codes, uniques) pair instead of\n",
"# unpacking into `_`, which would shadow IPython's last-output variable.\n",
"train['cid'] = train.customer_ID.factorize()[0]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "469063ce-d3f0-48ee-944b-dc048f821dd1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Verify target distribution is consistent across tr and va\n",
"0.2493462806763533 0.24835027876095958\n"
]
}
],
"source": [
"# Hold out the customers whose integer id is a multiple of 4 for validation;\n",
"# splitting on `cid` keeps all rows of a customer on the same side.\n",
"mask = (train['cid'] % 4) == 0\n",
"tr = train.loc[~mask]\n",
"va = train.loc[mask]\n",
"print(\"Verify target distribution is consistent across tr and va\")\n",
"print(tr['target'].mean(), va['target'].mean())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment