Skip to content

Instantly share code, notes, and snippets.

@cyacyamaru
Created May 3, 2019 04:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cyacyamaru/d4d35ee0e159596175638d1ba80bd360 to your computer and use it in GitHub Desktop.
Save cyacyamaru/d4d35ee0e159596175638d1ba80bd360 to your computer and use it in GitHub Desktop.
santander_late2
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"##https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment\n",
"##https://www.kaggle.com/yag320/list-of-fake-samples-and-public-private-lb-split\n",
"##https://www.kaggle.com/qitian51212/simple-magic-var-0-922"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# LOAD LIBRARIES\n",
"import numpy as np\n",
"import pandas as pd\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"from sklearn.metrics import roc_auc_score\n",
"import lightgbm as lgb\n",
"import statsmodels.api as sm\n",
"from tqdm import tnrange, tqdm, tqdm_notebook\n",
"from sklearn.model_selection import KFold, StratifiedKFold"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 100000 real test\n",
"Found 100000 fake test\n"
]
}
],
"source": [
"# GET INDICIES OF REAL TEST DATA\n",
"#######################\n",
"# TAKE FROM YAG320'S KERNEL\n",
"# https://www.kaggle.com/yag320/list-of-fake-samples-and-public-private-lb-split\n",
"\n",
"df_test = pd.read_csv('input/test_santander.csv')\n",
"df_test.drop(['ID_code'], axis=1, inplace=True)\n",
"df_test = df_test.values\n",
"\n",
"unique_samples = []\n",
"unique_count = np.zeros_like(df_test)\n",
"for feature in range(df_test.shape[1]):\n",
" _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)\n",
" unique_count[index_[count_ == 1], feature] += 1\n",
"\n",
"# Samples which have unique values are real the others are fake\n",
"real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]\n",
"synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]\n",
"\n",
"print('Found',len(real_samples_indexes),'real test')\n",
"print('Found',len(synthetic_samples_indexes),'fake test')\n",
"#######################"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true,
"scrolled": false
},
"outputs": [],
"source": [
"#Load Datas\n",
"#df_testをreal_testで再使用するのでもう一回load\n",
"df_train = pd.read_csv('input/train_santander.csv')\n",
"df_test = pd.read_csv('input/test_santander.csv')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(200000, 202)\n",
"(200000, 201)\n"
]
}
],
"source": [
"print(df_train.shape)\n",
"print(df_test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID_code</th>\n",
" <th>target</th>\n",
" <th>var_0</th>\n",
" <th>var_1</th>\n",
" <th>var_2</th>\n",
" <th>var_3</th>\n",
" <th>var_4</th>\n",
" <th>var_5</th>\n",
" <th>var_6</th>\n",
" <th>var_7</th>\n",
" <th>...</th>\n",
" <th>var_190</th>\n",
" <th>var_191</th>\n",
" <th>var_192</th>\n",
" <th>var_193</th>\n",
" <th>var_194</th>\n",
" <th>var_195</th>\n",
" <th>var_196</th>\n",
" <th>var_197</th>\n",
" <th>var_198</th>\n",
" <th>var_199</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>train_0</td>\n",
" <td>0</td>\n",
" <td>8.9255</td>\n",
" <td>-6.7863</td>\n",
" <td>11.9081</td>\n",
" <td>5.0930</td>\n",
" <td>11.4607</td>\n",
" <td>-9.2834</td>\n",
" <td>5.1187</td>\n",
" <td>18.6266</td>\n",
" <td>...</td>\n",
" <td>4.4354</td>\n",
" <td>3.9642</td>\n",
" <td>3.1364</td>\n",
" <td>1.6910</td>\n",
" <td>18.5227</td>\n",
" <td>-2.3978</td>\n",
" <td>7.8784</td>\n",
" <td>8.5635</td>\n",
" <td>12.7803</td>\n",
" <td>-1.0914</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>train_1</td>\n",
" <td>0</td>\n",
" <td>11.5006</td>\n",
" <td>-4.1473</td>\n",
" <td>13.8588</td>\n",
" <td>5.3890</td>\n",
" <td>12.3622</td>\n",
" <td>7.0433</td>\n",
" <td>5.6208</td>\n",
" <td>16.5338</td>\n",
" <td>...</td>\n",
" <td>7.6421</td>\n",
" <td>7.7214</td>\n",
" <td>2.5837</td>\n",
" <td>10.9516</td>\n",
" <td>15.4305</td>\n",
" <td>2.0339</td>\n",
" <td>8.1267</td>\n",
" <td>8.7889</td>\n",
" <td>18.3560</td>\n",
" <td>1.9518</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>train_2</td>\n",
" <td>0</td>\n",
" <td>8.6093</td>\n",
" <td>-2.7457</td>\n",
" <td>12.0805</td>\n",
" <td>7.8928</td>\n",
" <td>10.5825</td>\n",
" <td>-9.0837</td>\n",
" <td>6.9427</td>\n",
" <td>14.6155</td>\n",
" <td>...</td>\n",
" <td>2.9057</td>\n",
" <td>9.7905</td>\n",
" <td>1.6704</td>\n",
" <td>1.6858</td>\n",
" <td>21.6042</td>\n",
" <td>3.1417</td>\n",
" <td>-6.5213</td>\n",
" <td>8.2675</td>\n",
" <td>14.7222</td>\n",
" <td>0.3965</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>train_3</td>\n",
" <td>0</td>\n",
" <td>11.0604</td>\n",
" <td>-2.1518</td>\n",
" <td>8.9522</td>\n",
" <td>7.1957</td>\n",
" <td>12.5846</td>\n",
" <td>-1.8361</td>\n",
" <td>5.8428</td>\n",
" <td>14.9250</td>\n",
" <td>...</td>\n",
" <td>4.4666</td>\n",
" <td>4.7433</td>\n",
" <td>0.7178</td>\n",
" <td>1.4214</td>\n",
" <td>23.0347</td>\n",
" <td>-1.2706</td>\n",
" <td>-2.9275</td>\n",
" <td>10.2922</td>\n",
" <td>17.9697</td>\n",
" <td>-8.9996</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>train_4</td>\n",
" <td>0</td>\n",
" <td>9.8369</td>\n",
" <td>-1.4834</td>\n",
" <td>12.8746</td>\n",
" <td>6.6375</td>\n",
" <td>12.2772</td>\n",
" <td>2.4486</td>\n",
" <td>5.9405</td>\n",
" <td>19.2514</td>\n",
" <td>...</td>\n",
" <td>-1.4905</td>\n",
" <td>9.5214</td>\n",
" <td>-0.1508</td>\n",
" <td>9.1942</td>\n",
" <td>13.2876</td>\n",
" <td>-1.5121</td>\n",
" <td>3.9267</td>\n",
" <td>9.5031</td>\n",
" <td>17.9974</td>\n",
" <td>-8.8104</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 202 columns</p>\n",
"</div>"
],
"text/plain": [
" ID_code target var_0 var_1 var_2 var_3 var_4 var_5 var_6 \\\n",
"0 train_0 0 8.9255 -6.7863 11.9081 5.0930 11.4607 -9.2834 5.1187 \n",
"1 train_1 0 11.5006 -4.1473 13.8588 5.3890 12.3622 7.0433 5.6208 \n",
"2 train_2 0 8.6093 -2.7457 12.0805 7.8928 10.5825 -9.0837 6.9427 \n",
"3 train_3 0 11.0604 -2.1518 8.9522 7.1957 12.5846 -1.8361 5.8428 \n",
"4 train_4 0 9.8369 -1.4834 12.8746 6.6375 12.2772 2.4486 5.9405 \n",
"\n",
" var_7 ... var_190 var_191 var_192 var_193 var_194 var_195 \\\n",
"0 18.6266 ... 4.4354 3.9642 3.1364 1.6910 18.5227 -2.3978 \n",
"1 16.5338 ... 7.6421 7.7214 2.5837 10.9516 15.4305 2.0339 \n",
"2 14.6155 ... 2.9057 9.7905 1.6704 1.6858 21.6042 3.1417 \n",
"3 14.9250 ... 4.4666 4.7433 0.7178 1.4214 23.0347 -1.2706 \n",
"4 19.2514 ... -1.4905 9.5214 -0.1508 9.1942 13.2876 -1.5121 \n",
"\n",
" var_196 var_197 var_198 var_199 \n",
"0 7.8784 8.5635 12.7803 -1.0914 \n",
"1 8.1267 8.7889 18.3560 1.9518 \n",
"2 -6.5213 8.2675 14.7222 0.3965 \n",
"3 -2.9275 10.2922 17.9697 -8.9996 \n",
"4 3.9267 9.5031 17.9974 -8.8104 \n",
"\n",
"[5 rows x 202 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Check data\n",
"df_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID_code</th>\n",
" <th>var_0</th>\n",
" <th>var_1</th>\n",
" <th>var_2</th>\n",
" <th>var_3</th>\n",
" <th>var_4</th>\n",
" <th>var_5</th>\n",
" <th>var_6</th>\n",
" <th>var_7</th>\n",
" <th>var_8</th>\n",
" <th>...</th>\n",
" <th>var_190</th>\n",
" <th>var_191</th>\n",
" <th>var_192</th>\n",
" <th>var_193</th>\n",
" <th>var_194</th>\n",
" <th>var_195</th>\n",
" <th>var_196</th>\n",
" <th>var_197</th>\n",
" <th>var_198</th>\n",
" <th>var_199</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>test_0</td>\n",
" <td>11.0656</td>\n",
" <td>7.7798</td>\n",
" <td>12.9536</td>\n",
" <td>9.4292</td>\n",
" <td>11.4327</td>\n",
" <td>-2.3805</td>\n",
" <td>5.8493</td>\n",
" <td>18.2675</td>\n",
" <td>2.1337</td>\n",
" <td>...</td>\n",
" <td>-2.1556</td>\n",
" <td>11.8495</td>\n",
" <td>-1.4300</td>\n",
" <td>2.4508</td>\n",
" <td>13.7112</td>\n",
" <td>2.4669</td>\n",
" <td>4.3654</td>\n",
" <td>10.7200</td>\n",
" <td>15.4722</td>\n",
" <td>-8.7197</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>test_1</td>\n",
" <td>8.5304</td>\n",
" <td>1.2543</td>\n",
" <td>11.3047</td>\n",
" <td>5.1858</td>\n",
" <td>9.1974</td>\n",
" <td>-4.0117</td>\n",
" <td>6.0196</td>\n",
" <td>18.6316</td>\n",
" <td>-4.4131</td>\n",
" <td>...</td>\n",
" <td>10.6165</td>\n",
" <td>8.8349</td>\n",
" <td>0.9403</td>\n",
" <td>10.1282</td>\n",
" <td>15.5765</td>\n",
" <td>0.4773</td>\n",
" <td>-1.4852</td>\n",
" <td>9.8714</td>\n",
" <td>19.1293</td>\n",
" <td>-20.9760</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>test_2</td>\n",
" <td>5.4827</td>\n",
" <td>-10.3581</td>\n",
" <td>10.1407</td>\n",
" <td>7.0479</td>\n",
" <td>10.2628</td>\n",
" <td>9.8052</td>\n",
" <td>4.8950</td>\n",
" <td>20.2537</td>\n",
" <td>1.5233</td>\n",
" <td>...</td>\n",
" <td>-0.7484</td>\n",
" <td>10.9935</td>\n",
" <td>1.9803</td>\n",
" <td>2.1800</td>\n",
" <td>12.9813</td>\n",
" <td>2.1281</td>\n",
" <td>-7.1086</td>\n",
" <td>7.0618</td>\n",
" <td>19.8956</td>\n",
" <td>-23.1794</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>test_3</td>\n",
" <td>8.5374</td>\n",
" <td>-1.3222</td>\n",
" <td>12.0220</td>\n",
" <td>6.5749</td>\n",
" <td>8.8458</td>\n",
" <td>3.1744</td>\n",
" <td>4.9397</td>\n",
" <td>20.5660</td>\n",
" <td>3.3755</td>\n",
" <td>...</td>\n",
" <td>9.5702</td>\n",
" <td>9.0766</td>\n",
" <td>1.6580</td>\n",
" <td>3.5813</td>\n",
" <td>15.1874</td>\n",
" <td>3.1656</td>\n",
" <td>3.9567</td>\n",
" <td>9.2295</td>\n",
" <td>13.0168</td>\n",
" <td>-4.2108</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>test_4</td>\n",
" <td>11.7058</td>\n",
" <td>-0.1327</td>\n",
" <td>14.1295</td>\n",
" <td>7.7506</td>\n",
" <td>9.1035</td>\n",
" <td>-8.5848</td>\n",
" <td>6.8595</td>\n",
" <td>10.6048</td>\n",
" <td>2.9890</td>\n",
" <td>...</td>\n",
" <td>4.2259</td>\n",
" <td>9.1723</td>\n",
" <td>1.2835</td>\n",
" <td>3.3778</td>\n",
" <td>19.5542</td>\n",
" <td>-0.2860</td>\n",
" <td>-5.1612</td>\n",
" <td>7.2882</td>\n",
" <td>13.9260</td>\n",
" <td>-9.1846</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 201 columns</p>\n",
"</div>"
],
"text/plain": [
" ID_code var_0 var_1 var_2 var_3 var_4 var_5 var_6 \\\n",
"0 test_0 11.0656 7.7798 12.9536 9.4292 11.4327 -2.3805 5.8493 \n",
"1 test_1 8.5304 1.2543 11.3047 5.1858 9.1974 -4.0117 6.0196 \n",
"2 test_2 5.4827 -10.3581 10.1407 7.0479 10.2628 9.8052 4.8950 \n",
"3 test_3 8.5374 -1.3222 12.0220 6.5749 8.8458 3.1744 4.9397 \n",
"4 test_4 11.7058 -0.1327 14.1295 7.7506 9.1035 -8.5848 6.8595 \n",
"\n",
" var_7 var_8 ... var_190 var_191 var_192 var_193 var_194 \\\n",
"0 18.2675 2.1337 ... -2.1556 11.8495 -1.4300 2.4508 13.7112 \n",
"1 18.6316 -4.4131 ... 10.6165 8.8349 0.9403 10.1282 15.5765 \n",
"2 20.2537 1.5233 ... -0.7484 10.9935 1.9803 2.1800 12.9813 \n",
"3 20.5660 3.3755 ... 9.5702 9.0766 1.6580 3.5813 15.1874 \n",
"4 10.6048 2.9890 ... 4.2259 9.1723 1.2835 3.3778 19.5542 \n",
"\n",
" var_195 var_196 var_197 var_198 var_199 \n",
"0 2.4669 4.3654 10.7200 15.4722 -8.7197 \n",
"1 0.4773 -1.4852 9.8714 19.1293 -20.9760 \n",
"2 2.1281 -7.1086 7.0618 19.8956 -23.1794 \n",
"3 3.1656 3.9567 9.2295 13.0168 -4.2108 \n",
"4 -0.2860 -5.1612 7.2882 13.9260 -9.1846 \n",
"\n",
"[5 rows x 201 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#real_testのデータ型整理?\n",
"real_ids = list(set(real_samples_indexes))\n",
"real_test = df_test.iloc[real_ids]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#以下主に下記kernelから引用\n",
"##https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment\n",
"##https://www.kaggle.com/qitian51212/simple-magic-var-0-922\n",
"features = [col for col in df_train.columns if col not in ['ID_code', 'target']]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"200\n"
]
}
],
"source": [
"print(len(features))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6e36f06d3f2a4046b9be832ac8af30ac"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "32bf2acea0c84d59b70426781aad72ab"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"#concatenate train data and real test data with new features\n",
"df = pd.concat([df_train,real_test], axis = 0)\n",
"\n",
"for feat in tqdm_notebook(features):\n",
" df[feat+'_var'] = df.groupby([feat])[feat].transform('var')\n",
"\n",
"for feat in tqdm_notebook(features):\n",
" df[feat+'plus_'] = df[feat] + df[feat+'_var']\n",
"\n",
"drop_features = [c for c in df.columns if '_var' in c]\n",
"df.drop(drop_features, axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#divide into new train and test sets\n",
"df_train = df.iloc[:df_train.shape[0]]\n",
"df_test2 = df.iloc[df_train.shape[0]:]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#新trainデータのtargetを使用してfeaturesを再定義\n",
"features = [col for col in df_train.columns if col not in ['ID_code', 'target']]\n",
"target = df_train['target']"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(200000, 402)\n",
"(100000, 402)\n",
"400\n"
]
}
],
"source": [
"print(df_train.shape)\n",
"print(df_test2.shape)\n",
"print(len(features))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID_code</th>\n",
" <th>target</th>\n",
" <th>var_0</th>\n",
" <th>var_1</th>\n",
" <th>var_10</th>\n",
" <th>var_100</th>\n",
" <th>var_101</th>\n",
" <th>var_102</th>\n",
" <th>var_103</th>\n",
" <th>var_104</th>\n",
" <th>...</th>\n",
" <th>var_190plus_</th>\n",
" <th>var_191plus_</th>\n",
" <th>var_192plus_</th>\n",
" <th>var_193plus_</th>\n",
" <th>var_194plus_</th>\n",
" <th>var_195plus_</th>\n",
" <th>var_196plus_</th>\n",
" <th>var_197plus_</th>\n",
" <th>var_198plus_</th>\n",
" <th>var_199plus_</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>train_0</td>\n",
" <td>0.0</td>\n",
" <td>8.9255</td>\n",
" <td>-6.7863</td>\n",
" <td>2.9252</td>\n",
" <td>9.4763</td>\n",
" <td>13.3102</td>\n",
" <td>26.5376</td>\n",
" <td>1.4403</td>\n",
" <td>14.7100</td>\n",
" <td>...</td>\n",
" <td>4.4354</td>\n",
" <td>3.9642</td>\n",
" <td>3.1364</td>\n",
" <td>1.6910</td>\n",
" <td>18.5227</td>\n",
" <td>-2.3978</td>\n",
" <td>7.8784</td>\n",
" <td>8.5635</td>\n",
" <td>12.7803</td>\n",
" <td>-1.0914</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>train_1</td>\n",
" <td>0.0</td>\n",
" <td>11.5006</td>\n",
" <td>-4.1473</td>\n",
" <td>-0.4032</td>\n",
" <td>-13.6950</td>\n",
" <td>8.4068</td>\n",
" <td>35.4734</td>\n",
" <td>1.7093</td>\n",
" <td>15.1866</td>\n",
" <td>...</td>\n",
" <td>7.6421</td>\n",
" <td>7.7214</td>\n",
" <td>2.5837</td>\n",
" <td>10.9516</td>\n",
" <td>15.4305</td>\n",
" <td>2.0339</td>\n",
" <td>8.1267</td>\n",
" <td>8.7889</td>\n",
" <td>18.3560</td>\n",
" <td>1.9518</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>train_2</td>\n",
" <td>0.0</td>\n",
" <td>8.6093</td>\n",
" <td>-2.7457</td>\n",
" <td>-0.3249</td>\n",
" <td>-0.3939</td>\n",
" <td>12.6317</td>\n",
" <td>14.8863</td>\n",
" <td>1.3854</td>\n",
" <td>15.0284</td>\n",
" <td>...</td>\n",
" <td>2.9057</td>\n",
" <td>9.7905</td>\n",
" <td>1.6704</td>\n",
" <td>1.6858</td>\n",
" <td>21.6042</td>\n",
" <td>3.1417</td>\n",
" <td>-6.5213</td>\n",
" <td>8.2675</td>\n",
" <td>14.7222</td>\n",
" <td>0.3965</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>train_3</td>\n",
" <td>0.0</td>\n",
" <td>11.0604</td>\n",
" <td>-2.1518</td>\n",
" <td>2.3061</td>\n",
" <td>-19.8592</td>\n",
" <td>22.5316</td>\n",
" <td>18.6129</td>\n",
" <td>1.3512</td>\n",
" <td>9.3291</td>\n",
" <td>...</td>\n",
" <td>4.4666</td>\n",
" <td>4.7433</td>\n",
" <td>0.7178</td>\n",
" <td>1.4214</td>\n",
" <td>23.0347</td>\n",
" <td>-1.2706</td>\n",
" <td>-2.9275</td>\n",
" <td>10.2922</td>\n",
" <td>17.9697</td>\n",
" <td>-8.9996</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>train_4</td>\n",
" <td>0.0</td>\n",
" <td>9.8369</td>\n",
" <td>-1.4834</td>\n",
" <td>-9.4458</td>\n",
" <td>-22.9264</td>\n",
" <td>12.3562</td>\n",
" <td>17.3410</td>\n",
" <td>1.6940</td>\n",
" <td>7.1179</td>\n",
" <td>...</td>\n",
" <td>-1.4905</td>\n",
" <td>9.5214</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>-1.5121</td>\n",
" <td>3.9267</td>\n",
" <td>9.5031</td>\n",
" <td>17.9974</td>\n",
" <td>-8.8104</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 402 columns</p>\n",
"</div>"
],
"text/plain": [
" ID_code target var_0 var_1 var_10 var_100 var_101 var_102 \\\n",
"0 train_0 0.0 8.9255 -6.7863 2.9252 9.4763 13.3102 26.5376 \n",
"1 train_1 0.0 11.5006 -4.1473 -0.4032 -13.6950 8.4068 35.4734 \n",
"2 train_2 0.0 8.6093 -2.7457 -0.3249 -0.3939 12.6317 14.8863 \n",
"3 train_3 0.0 11.0604 -2.1518 2.3061 -19.8592 22.5316 18.6129 \n",
"4 train_4 0.0 9.8369 -1.4834 -9.4458 -22.9264 12.3562 17.3410 \n",
"\n",
" var_103 var_104 ... var_190plus_ var_191plus_ var_192plus_ \\\n",
"0 1.4403 14.7100 ... 4.4354 3.9642 3.1364 \n",
"1 1.7093 15.1866 ... 7.6421 7.7214 2.5837 \n",
"2 1.3854 15.0284 ... 2.9057 9.7905 1.6704 \n",
"3 1.3512 9.3291 ... 4.4666 4.7433 0.7178 \n",
"4 1.6940 7.1179 ... -1.4905 9.5214 NaN \n",
"\n",
" var_193plus_ var_194plus_ var_195plus_ var_196plus_ var_197plus_ \\\n",
"0 1.6910 18.5227 -2.3978 7.8784 8.5635 \n",
"1 10.9516 15.4305 2.0339 8.1267 8.7889 \n",
"2 1.6858 21.6042 3.1417 -6.5213 8.2675 \n",
"3 1.4214 23.0347 -1.2706 -2.9275 10.2922 \n",
"4 NaN NaN -1.5121 3.9267 9.5031 \n",
"\n",
" var_198plus_ var_199plus_ \n",
"0 12.7803 -1.0914 \n",
"1 18.3560 1.9518 \n",
"2 14.7222 0.3965 \n",
"3 17.9697 -8.9996 \n",
"4 17.9974 -8.8104 \n",
"\n",
"[5 rows x 402 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID_code</th>\n",
" <th>target</th>\n",
" <th>var_0</th>\n",
" <th>var_1</th>\n",
" <th>var_10</th>\n",
" <th>var_100</th>\n",
" <th>var_101</th>\n",
" <th>var_102</th>\n",
" <th>var_103</th>\n",
" <th>var_104</th>\n",
" <th>...</th>\n",
" <th>var_190plus_</th>\n",
" <th>var_191plus_</th>\n",
" <th>var_192plus_</th>\n",
" <th>var_193plus_</th>\n",
" <th>var_194plus_</th>\n",
" <th>var_195plus_</th>\n",
" <th>var_196plus_</th>\n",
" <th>var_197plus_</th>\n",
" <th>var_198plus_</th>\n",
" <th>var_199plus_</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>test_3</td>\n",
" <td>NaN</td>\n",
" <td>8.5374</td>\n",
" <td>-1.3222</td>\n",
" <td>0.0095</td>\n",
" <td>1.7021</td>\n",
" <td>2.5363</td>\n",
" <td>3.8763</td>\n",
" <td>1.5173</td>\n",
" <td>13.4083</td>\n",
" <td>...</td>\n",
" <td>9.5702</td>\n",
" <td>9.0766</td>\n",
" <td>1.6580</td>\n",
" <td>3.5813</td>\n",
" <td>15.1874</td>\n",
" <td>3.1656</td>\n",
" <td>3.9567</td>\n",
" <td>9.2295</td>\n",
" <td>13.0168</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>test_7</td>\n",
" <td>NaN</td>\n",
" <td>17.3035</td>\n",
" <td>-2.4212</td>\n",
" <td>4.0355</td>\n",
" <td>-22.7363</td>\n",
" <td>11.6984</td>\n",
" <td>18.0784</td>\n",
" <td>1.7108</td>\n",
" <td>9.2897</td>\n",
" <td>...</td>\n",
" <td>4.4676</td>\n",
" <td>4.4214</td>\n",
" <td>0.9303</td>\n",
" <td>1.4994</td>\n",
" <td>15.2648</td>\n",
" <td>-1.7931</td>\n",
" <td>6.5316</td>\n",
" <td>10.4855</td>\n",
" <td>NaN</td>\n",
" <td>0.7283</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>test_11</td>\n",
" <td>NaN</td>\n",
" <td>10.6137</td>\n",
" <td>-2.1898</td>\n",
" <td>2.4620</td>\n",
" <td>-14.9298</td>\n",
" <td>23.1641</td>\n",
" <td>5.2171</td>\n",
" <td>1.7760</td>\n",
" <td>8.4856</td>\n",
" <td>...</td>\n",
" <td>13.1683</td>\n",
" <td>4.0625</td>\n",
" <td>-0.1537</td>\n",
" <td>7.9787</td>\n",
" <td>18.4518</td>\n",
" <td>0.1000</td>\n",
" <td>NaN</td>\n",
" <td>9.2355</td>\n",
" <td>15.0721</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>test_15</td>\n",
" <td>NaN</td>\n",
" <td>14.8595</td>\n",
" <td>-4.5378</td>\n",
" <td>3.9229</td>\n",
" <td>-13.1135</td>\n",
" <td>19.4192</td>\n",
" <td>21.7291</td>\n",
" <td>1.3708</td>\n",
" <td>8.5129</td>\n",
" <td>...</td>\n",
" <td>2.6735</td>\n",
" <td>5.8526</td>\n",
" <td>NaN</td>\n",
" <td>2.5020</td>\n",
" <td>22.8224</td>\n",
" <td>-0.9325</td>\n",
" <td>8.6849</td>\n",
" <td>10.2848</td>\n",
" <td>17.4932</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>test_16</td>\n",
" <td>NaN</td>\n",
" <td>14.1732</td>\n",
" <td>-5.1490</td>\n",
" <td>-5.8456</td>\n",
" <td>-2.4555</td>\n",
" <td>12.4817</td>\n",
" <td>17.8563</td>\n",
" <td>1.6619</td>\n",
" <td>12.7459</td>\n",
" <td>...</td>\n",
" <td>0.8640</td>\n",
" <td>5.9058</td>\n",
" <td>1.3140</td>\n",
" <td>4.8961</td>\n",
" <td>20.1087</td>\n",
" <td>1.1051</td>\n",
" <td>7.7184</td>\n",
" <td>9.3406</td>\n",
" <td>21.1746</td>\n",
" <td>-2.0098</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 402 columns</p>\n",
"</div>"
],
"text/plain": [
" ID_code target var_0 var_1 var_10 var_100 var_101 var_102 \\\n",
"3 test_3 NaN 8.5374 -1.3222 0.0095 1.7021 2.5363 3.8763 \n",
"7 test_7 NaN 17.3035 -2.4212 4.0355 -22.7363 11.6984 18.0784 \n",
"11 test_11 NaN 10.6137 -2.1898 2.4620 -14.9298 23.1641 5.2171 \n",
"15 test_15 NaN 14.8595 -4.5378 3.9229 -13.1135 19.4192 21.7291 \n",
"16 test_16 NaN 14.1732 -5.1490 -5.8456 -2.4555 12.4817 17.8563 \n",
"\n",
" var_103 var_104 ... var_190plus_ var_191plus_ var_192plus_ \\\n",
"3 1.5173 13.4083 ... 9.5702 9.0766 1.6580 \n",
"7 1.7108 9.2897 ... 4.4676 4.4214 0.9303 \n",
"11 1.7760 8.4856 ... 13.1683 4.0625 -0.1537 \n",
"15 1.3708 8.5129 ... 2.6735 5.8526 NaN \n",
"16 1.6619 12.7459 ... 0.8640 5.9058 1.3140 \n",
"\n",
" var_193plus_ var_194plus_ var_195plus_ var_196plus_ var_197plus_ \\\n",
"3 3.5813 15.1874 3.1656 3.9567 9.2295 \n",
"7 1.4994 15.2648 -1.7931 6.5316 10.4855 \n",
"11 7.9787 18.4518 0.1000 NaN 9.2355 \n",
"15 2.5020 22.8224 -0.9325 8.6849 10.2848 \n",
"16 4.8961 20.1087 1.1051 7.7184 9.3406 \n",
"\n",
" var_198plus_ var_199plus_ \n",
"3 13.0168 NaN \n",
"7 NaN 0.7283 \n",
"11 15.0721 NaN \n",
"15 17.4932 NaN \n",
"16 21.1746 -2.0098 \n",
"\n",
"[5 rows x 402 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test2.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"#LightGBM parameters\n",
"#Parameters are referenced by https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment\n",
"random_state = 4242\n",
"\n",
"lgb_params = {\n",
" \"objective\" : \"binary\",\n",
" \"metric\" : \"auc\",\n",
" \"boosting\": 'gbdt',\n",
" \"max_depth\" : -1,\n",
" \"num_leaves\" : 13,\n",
" \"learning_rate\" : 0.01,\n",
" \"bagging_freq\": 5,\n",
" \"bagging_fraction\" : 0.4,\n",
" \"feature_fraction\" : 0.05,\n",
" \"min_data_in_leaf\": 80,\n",
" \"min_sum_heassian_in_leaf\": 10,\n",
" \"tree_learner\": \"serial\",\n",
" \"boost_from_average\": \"false\",\n",
" #\"lambda_l1\" : 5,\n",
" #\"lambda_l2\" : 5,\n",
" \"bagging_seed\" : random_state,\n",
" \"verbosity\" : 1,\n",
" \"seed\": random_state\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)\n",
"oof = np.zeros(len(df_train))\n",
"predictions = np.zeros(len(df_test2))\n",
"val_aucs = []\n",
"feature_importance_df = pd.DataFrame()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fold 0\n",
"Training until validation scores don't improve for 3000 rounds.\n",
"[5000]\ttraining's auc: 0.950915\tvalid_1's auc: 0.923566\n",
"[10000]\ttraining's auc: 0.9706\tvalid_1's auc: 0.92516\n",
"Early stopping, best iteration is:\n",
"[11253]\ttraining's auc: 0.974484\tvalid_1's auc: 0.925211\n",
"Fold 1\n",
"Training until validation scores don't improve for 3000 rounds.\n",
"[5000]\ttraining's auc: 0.951279\tvalid_1's auc: 0.921483\n",
"[10000]\ttraining's auc: 0.970845\tvalid_1's auc: 0.922899\n",
"Early stopping, best iteration is:\n",
"[10620]\ttraining's auc: 0.972768\tvalid_1's auc: 0.922925\n",
"Fold 2\n",
"Training until validation scores don't improve for 3000 rounds.\n",
"[5000]\ttraining's auc: 0.951755\tvalid_1's auc: 0.918564\n",
"[10000]\ttraining's auc: 0.971292\tvalid_1's auc: 0.9206\n",
"Early stopping, best iteration is:\n",
"[10053]\ttraining's auc: 0.971454\tvalid_1's auc: 0.92061\n",
"Fold 3\n",
"Training until validation scores don't improve for 3000 rounds.\n",
"[5000]\ttraining's auc: 0.951564\tvalid_1's auc: 0.920813\n",
"[10000]\ttraining's auc: 0.970987\tvalid_1's auc: 0.922464\n",
"Early stopping, best iteration is:\n",
"[11860]\ttraining's auc: 0.97652\tvalid_1's auc: 0.922573\n",
"Fold 4\n",
"Training until validation scores don't improve for 3000 rounds.\n",
"[5000]\ttraining's auc: 0.952044\tvalid_1's auc: 0.917226\n",
"[10000]\ttraining's auc: 0.97132\tvalid_1's auc: 0.91916\n",
"Early stopping, best iteration is:\n",
"[9968]\ttraining's auc: 0.971221\tvalid_1's auc: 0.919185\n"
]
}
],
"source": [
"for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):\n",
" print(\"Fold {}\".format(fold_))\n",
" trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])\n",
" val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])\n",
" clf = lgb.train(lgb_params, trn_data, 100000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 3000)\n",
" oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)\n",
" val_aucs.append(roc_auc_score(target[val_idx] , oof[val_idx] ))\n",
" predictions += clf.predict(df_test2[features], num_iteration=clf.best_iteration) / folds.n_splits\n",
" fold_importance_df = pd.DataFrame()\n",
" fold_importance_df[\"feature\"] = features\n",
" fold_importance_df[\"importance\"] = clf.feature_importance()\n",
" fold_importance_df[\"fold\"] = fold_ + 1\n",
" feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean auc: 0.922101029, std: 0.002064024. All auc: 0.922096481.\n"
]
}
],
"source": [
"mean_auc = np.mean(val_aucs)\n",
"std_auc = np.std(val_aucs)\n",
"all_auc = roc_auc_score(target, oof)\n",
"print(\"Mean auc: %.9f, std: %.9f. All auc: %.9f.\" % (mean_auc, std_auc, all_auc))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"subreal = pd.DataFrame({\"ID_code\":df_test2[\"ID_code\"].values})\n",
"subreal['target']=predictions\n",
"sub = pd.DataFrame({\"ID_code\": df_test.ID_code.values})"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"finalsub = sub.set_index('ID_code').join(subreal.set_index('ID_code')).reset_index()\n",
"finalsub.fillna(0,inplace=True)\n",
"finalsub.to_csv(\"output/submission.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment