aicrowd-bot/036c942b-b6bd-4fba-8e06-dbda65dea240.ipynb

## 036c942b-b6bd-4fba-8e06-dbda65dea240.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:07:16.404707Z",
     "iopub.status.busy": "2021-05-04T11:07:16.392205Z",
     "iopub.status.idle": "2021-05-04T11:07:17.522413Z",
     "shell.execute_reply": "2021-05-04T11:07:17.521622Z"
    },
    "papermill": {
     "duration": 1.158665,
     "end_time": "2021-05-04T11:07:17.522573",
     "exception": false,
     "start_time": "2021-05-04T11:07:16.363908",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "!cp -r ../input/addisamples ds_shared_drive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:07:17.564472Z",
     "iopub.status.busy": "2021-05-04T11:07:17.560621Z",
     "iopub.status.idle": "2021-05-04T11:07:18.287784Z",
     "shell.execute_reply": "2021-05-04T11:07:18.287200Z"
    },
    "papermill": {
     "duration": 0.74864,
     "end_time": "2021-05-04T11:07:18.287925",
     "exception": false,
     "start_time": "2021-05-04T11:07:17.539285",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "!mkdir -p assets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.016122,
     "end_time": "2021-05-04T11:07:18.320427",
     "exception": false,
     "start_time": "2021-05-04T11:07:18.304305",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:07:18.359185Z",
     "iopub.status.busy": "2021-05-04T11:07:18.358280Z",
     "iopub.status.idle": "2021-05-04T11:07:18.361270Z",
     "shell.execute_reply": "2021-05-04T11:07:18.361717Z"
    },
    "papermill": {
     "duration": 0.025337,
     "end_time": "2021-05-04T11:07:18.361887",
     "exception": false,
     "start_time": "2021-05-04T11:07:18.336550",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Please use the absolute for the location of the dataset.\n",
    "# Or you can use relative path with `os.getcwd() + \"test_data/validation.csv\"`\n",
    "AICROWD_DATASET_PATH = os.getenv(\"DATASET_PATH\", \"ds_shared_drive/train.csv\")\n",
    "AICROWD_PREDICTIONS_PATH = os.getenv(\"PREDICTIONS_PATH\", \"predictions.csv\")\n",
    "AICROWD_ASSETS_DIR = \"assets\"\n",
    "AICROWD_API_KEY = \"\" # Get your key from https://www.aicrowd.com/participants/me"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:07:18.402596Z",
     "iopub.status.busy": "2021-05-04T11:07:18.401940Z",
     "iopub.status.idle": "2021-05-04T11:07:19.350933Z",
     "shell.execute_reply": "2021-05-04T11:07:19.351671Z"
    },
    "papermill": {
     "duration": 0.973501,
     "end_time": "2021-05-04T11:07:19.351868",
     "exception": false,
     "start_time": "2021-05-04T11:07:18.378367",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "sns.set()\n",
    "\n",
    "pd.set_option('display.max_rows', 500)\n",
    "pd.set_option('display.max_columns', 500)\n",
    "pd.set_option('display.width', 1000)\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:07:19.396788Z",
     "iopub.status.busy": "2021-05-04T11:07:19.395765Z",
     "iopub.status.idle": "2021-05-04T11:07:20.071584Z",
     "shell.execute_reply": "2021-05-04T11:07:20.070493Z"
    },
    "papermill": {
     "duration": 0.70269,
     "end_time": "2021-05-04T11:07:20.071774",
     "exception": false,
     "start_time": "2021-05-04T11:07:19.369084",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "target_col = \"diagnosis\"\n",
    "key_col = \"row_id\"\n",
    "cat_cols = ['intersection_pos_rel_centre']\n",
    "seed = 2021\n",
    "\n",
    "target_values = [\"normal\", \"post_alzheimer\", \"pre_alzheimer\"]\n",
    "\n",
    "train = pd.read_csv(AICROWD_DATASET_PATH)\n",
    "train = train[train[target_col].isin(target_values)].copy().reset_index(drop=True)\n",
    "\n",
    "\n",
    "print(train.shape)\n",
    "features = train.columns[1:-1].to_list()\n",
    "\n",
    "numeric_features = [c for c in features if c not in cat_cols]\n",
    "for c in numeric_features:\n",
    "    train[c] = train[c].astype(float)\n",
    "\n",
    "print(train[target_col].value_counts())\n",
    "train.tail(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.018283,
     "end_time": "2021-05-04T11:07:20.108748",
     "exception": false,
     "start_time": "2021-05-04T11:07:20.090465",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:07:20.165457Z",
     "iopub.status.busy": "2021-05-04T11:07:20.164718Z",
     "iopub.status.idle": "2021-05-04T11:07:20.357428Z",
     "shell.execute_reply": "2021-05-04T11:07:20.357904Z"
    },
    "papermill": {
     "duration": 0.230886,
     "end_time": "2021-05-04T11:07:20.358084",
     "exception": false,
     "start_time": "2021-05-04T11:07:20.127198",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "sns.countplot(x=target_col, data=train);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.018745,
     "end_time": "2021-05-04T11:07:20.396308",
     "exception": false,
     "start_time": "2021-05-04T11:07:20.377563",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Numerical features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:07:20.438665Z",
     "iopub.status.busy": "2021-05-04T11:07:20.437987Z",
     "iopub.status.idle": "2021-05-04T11:08:29.318670Z",
     "shell.execute_reply": "2021-05-04T11:08:29.317715Z"
    },
    "papermill": {
     "duration": 68.903519,
     "end_time": "2021-05-04T11:08:29.318826",
     "exception": false,
     "start_time": "2021-05-04T11:07:20.415307",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "nb_shown = len(numeric_features)\n",
    "fig, ax = plt.subplots(nb_shown, 1, figsize=(20,5*nb_shown))\n",
    "\n",
    "colors = [\"Green\", \"Blue\", \"Red\"]\n",
    "for i, col in enumerate(numeric_features[:nb_shown]):\n",
    "    for value, color in zip(target_values, colors):\n",
    "        sns.distplot(train.loc[train[target_col]==value, col], \n",
    "                     ax=ax[i], color=color, norm_hist=True)\n",
    "        ax[i].set_title(\"Train {}\".format(col))\n",
    "    ax[i].set_xlabel(\"\")\n",
    "    ax[i].set_xlabel(\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.147845,
     "end_time": "2021-05-04T11:08:29.616065",
     "exception": false,
     "start_time": "2021-05-04T11:08:29.468220",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Categorical features\n",
    "There is only 1 single categorical feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:29.919719Z",
     "iopub.status.busy": "2021-05-04T11:08:29.918894Z",
     "iopub.status.idle": "2021-05-04T11:08:30.210596Z",
     "shell.execute_reply": "2021-05-04T11:08:30.210029Z"
    },
    "papermill": {
     "duration": 0.446772,
     "end_time": "2021-05-04T11:08:30.210751",
     "exception": false,
     "start_time": "2021-05-04T11:08:29.763979",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "sns.countplot(x=cat_cols[0], hue=target_col, data=train[cat_cols+[target_col]].fillna(\"NA\"));"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:30.523489Z",
     "iopub.status.busy": "2021-05-04T11:08:30.517349Z",
     "iopub.status.idle": "2021-05-04T11:08:30.768144Z",
     "shell.execute_reply": "2021-05-04T11:08:30.767456Z"
    },
    "papermill": {
     "duration": 0.408804,
     "end_time": "2021-05-04T11:08:30.768280",
     "exception": false,
     "start_time": "2021-05-04T11:08:30.359476",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_pos = train[train[target_col].isin(target_values[1:])]\n",
    "nb_pos = df_pos.shape[0]\n",
    "nb_neg = nb_pos\n",
    "df_neg = train[train[target_col] == \"normal\"].sample(n=nb_neg, random_state=seed)\n",
    "df_samples = pd.concat([df_pos, df_neg]).sample(frac=1).reset_index(drop=True)\n",
    "\n",
    "sns.countplot(x=cat_cols[0], hue=target_col, data=df_samples[cat_cols+[target_col]].fillna(\"NA\"));"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.149623,
     "end_time": "2021-05-04T11:08:31.067408",
     "exception": false,
     "start_time": "2021-05-04T11:08:30.917785",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Baseline\n",
    "Because of the imbalance dataset, I will use the balanced one to create the baseline. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:31.380448Z",
     "iopub.status.busy": "2021-05-04T11:08:31.379740Z",
     "iopub.status.idle": "2021-05-04T11:08:31.510144Z",
     "shell.execute_reply": "2021-05-04T11:08:31.509462Z"
    },
    "papermill": {
     "duration": 0.292522,
     "end_time": "2021-05-04T11:08:31.510296",
     "exception": false,
     "start_time": "2021-05-04T11:08:31.217774",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(cat_cols)\n",
    "for c in cat_cols:\n",
    "    df_samples[c].fillna(\"NA\", inplace=True)\n",
    "    \n",
    "df_dummies = pd.get_dummies(df_samples[cat_cols], columns=cat_cols, dummy_na=True).add_prefix('CAT_')\n",
    "dummy_cols = df_dummies.columns.to_list()\n",
    "print(dummy_cols)\n",
    "\n",
    "df_samples = pd.concat([df_samples, df_dummies], axis=1)\n",
    "df_samples['cnt_NaN'] = df_samples[numeric_features].isna().sum(axis=1)\n",
    "\n",
    "df_samples.fillna(-1, inplace=True)\n",
    "df_samples.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:31.822535Z",
     "iopub.status.busy": "2021-05-04T11:08:31.821394Z",
     "iopub.status.idle": "2021-05-04T11:08:31.849740Z",
     "shell.execute_reply": "2021-05-04T11:08:31.849135Z"
    },
    "papermill": {
     "duration": 0.187767,
     "end_time": "2021-05-04T11:08:31.849879",
     "exception": false,
     "start_time": "2021-05-04T11:08:31.662112",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "model_features = df_samples.columns.to_list()\n",
    "model_features = [c for c in model_features if c not in [key_col, target_col] + cat_cols]\n",
    "\n",
    "unique_value_cols = []\n",
    "for c in model_features:\n",
    "    if df_samples[c].unique().shape[0] == 1:\n",
    "        unique_value_cols.append(c)\n",
    "        \n",
    "print(unique_value_cols)\n",
    "model_features = [c for c in model_features if c not in unique_value_cols]\n",
    "print(len(model_features))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:32.164878Z",
     "iopub.status.busy": "2021-05-04T11:08:32.164098Z",
     "iopub.status.idle": "2021-05-04T11:08:33.405424Z",
     "shell.execute_reply": "2021-05-04T11:08:33.404724Z"
    },
    "papermill": {
     "duration": 1.404616,
     "end_time": "2021-05-04T11:08:33.405562",
     "exception": false,
     "start_time": "2021-05-04T11:08:32.000946",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import lightgbm as lgb\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "X_train = df_samples[model_features]\n",
    "y_train = df_samples[target_col].map(dict(zip(target_values, list(range(len(target_values))))))\n",
    "\n",
    "X_test = df_samples[model_features]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:33.715488Z",
     "iopub.status.busy": "2021-05-04T11:08:33.714549Z",
     "iopub.status.idle": "2021-05-04T11:08:45.237648Z",
     "shell.execute_reply": "2021-05-04T11:08:45.242262Z"
    },
    "papermill": {
     "duration": 11.685539,
     "end_time": "2021-05-04T11:08:45.242603",
     "exception": false,
     "start_time": "2021-05-04T11:08:33.557064",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "skf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)\n",
    "preds = 0.0\n",
    "\n",
    "params = {\n",
    "          \"objective\" : \"multiclass\",\n",
    "          \"num_class\" : len(target_values),\n",
    "          \"bagging_seed\" : 2021,\n",
    "          \"verbosity\" : 1 }\n",
    "\n",
    "clfs = []\n",
    "for fold, (itrain, ivalid) in enumerate(skf.split(X_train, y_train)):\n",
    "    print(\"-\"*40)\n",
    "    print(f\"Running for fold {fold}\")\n",
    "    lgb_train = lgb.Dataset(X_train.iloc[itrain], y_train.iloc[itrain])\n",
    "    lgb_eval  = lgb.Dataset(X_train.iloc[ivalid], y_train.iloc[ivalid], reference = lgb_train)\n",
    "    clf = lgb.train(params, lgb_train, 1000, valid_sets=[lgb_eval], \n",
    "                    early_stopping_rounds=100, verbose_eval=200)\n",
    "\n",
    "    clfs.append(clf)\n",
    "    pred = clf.predict(X_test)\n",
    "    preds += pred/skf.n_splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:45.599328Z",
     "iopub.status.busy": "2021-05-04T11:08:45.597803Z",
     "iopub.status.idle": "2021-05-04T11:08:46.008833Z",
     "shell.execute_reply": "2021-05-04T11:08:46.009377Z"
    },
    "papermill": {
     "duration": 0.595883,
     "end_time": "2021-05-04T11:08:46.009566",
     "exception": false,
     "start_time": "2021-05-04T11:08:45.413683",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "lgb.plot_importance(clf, max_num_features=20);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:46.331978Z",
     "iopub.status.busy": "2021-05-04T11:08:46.330935Z",
     "iopub.status.idle": "2021-05-04T11:08:46.363726Z",
     "shell.execute_reply": "2021-05-04T11:08:46.364366Z"
    },
    "papermill": {
     "duration": 0.195863,
     "end_time": "2021-05-04T11:08:46.364563",
     "exception": false,
     "start_time": "2021-05-04T11:08:46.168700",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import joblib\n",
    "for i, clf in enumerate(clfs):\n",
    "    model_filename = f'{AICROWD_ASSETS_DIR}/model_lgb_fold_{i}.pkl'\n",
    "    joblib.dump(clf, model_filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.158194,
     "end_time": "2021-05-04T11:08:46.681169",
     "exception": false,
     "start_time": "2021-05-04T11:08:46.522975",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Prediction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:47.071316Z",
     "iopub.status.busy": "2021-05-04T11:08:47.037267Z",
     "iopub.status.idle": "2021-05-04T11:08:47.158382Z",
     "shell.execute_reply": "2021-05-04T11:08:47.157836Z"
    },
    "papermill": {
     "duration": 0.31786,
     "end_time": "2021-05-04T11:08:47.158529",
     "exception": false,
     "start_time": "2021-05-04T11:08:46.840669",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_test = train.sample(n=100)\n",
    "    \n",
    "for c in numeric_features:\n",
    "    df_test[c] = df_test[c].astype(float)\n",
    "    \n",
    "for c in cat_cols:\n",
    "    df_test[c].fillna(\"NA\", inplace=True)\n",
    "    \n",
    "df_test_dummies = pd.get_dummies(df_test[cat_cols], columns=cat_cols, dummy_na=True).add_prefix('CAT_')\n",
    "df_test = pd.concat([df_test, df_test_dummies], axis=1)\n",
    "df_test['cnt_NaN'] = df_test[numeric_features].isna().sum(axis=1)\n",
    "\n",
    "df_test.fillna(-1, inplace=True)\n",
    "\n",
    "for c in dummy_cols:\n",
    "    if c not in df_test.columns:\n",
    "        df_test[c] = 0\n",
    "\n",
    "print(\"Missing columns:\", [c for c in model_features if c not in df_test.columns])\n",
    "df_test.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:47.489364Z",
     "iopub.status.busy": "2021-05-04T11:08:47.488466Z",
     "iopub.status.idle": "2021-05-04T11:08:47.534274Z",
     "shell.execute_reply": "2021-05-04T11:08:47.535228Z"
    },
    "papermill": {
     "duration": 0.215922,
     "end_time": "2021-05-04T11:08:47.535430",
     "exception": false,
     "start_time": "2021-05-04T11:08:47.319508",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import joblib\n",
    "\n",
    "X_test = df_test[model_features]\n",
    "\n",
    "preds = 0.0\n",
    "nb_folds = 5 # skf.n_splits\n",
    "for fold in range(nb_folds):\n",
    "    print(\"-\"*40)\n",
    "    print(f\"Running for fold {fold}\")\n",
    "    model_filename = f'{AICROWD_ASSETS_DIR}/model_lgb_fold_{i}.pkl'\n",
    "    \n",
    "    clf = joblib.load(model_filename)\n",
    "    pred = clf.predict(X_test)\n",
    "    preds += pred/nb_folds\n",
    "    \n",
    "print(preds.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-05-04T11:08:47.893006Z",
     "iopub.status.busy": "2021-05-04T11:08:47.888741Z",
     "iopub.status.idle": "2021-05-04T11:08:48.174692Z",
     "shell.execute_reply": "2021-05-04T11:08:48.174030Z"
    },
    "papermill": {
     "duration": 0.478602,
     "end_time": "2021-05-04T11:08:48.174837",
     "exception": false,
     "start_time": "2021-05-04T11:08:47.696235",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "print(preds.min(), preds.max())\n",
    "for i, (value, color) in enumerate(zip(target_values, colors)):\n",
    "    sns.distplot(preds[:, i], color=color, norm_hist=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.160153,
     "end_time": "2021-05-04T11:08:48.495111",
     "exception": false,
     "start_time": "2021-05-04T11:08:48.334958",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# End"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 103.180567,
   "end_time": "2021-05-04T11:08:50.768320",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2021-05-04T11:07:07.587753",
   "version": "2.3.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:07:16.404707Z",
	"iopub.status.busy": "2021-05-04T11:07:16.392205Z",
	"iopub.status.idle": "2021-05-04T11:07:17.522413Z",
	"shell.execute_reply": "2021-05-04T11:07:17.521622Z"
	},
	"papermill": {
	"duration": 1.158665,
	"end_time": "2021-05-04T11:07:17.522573",
	"exception": false,
	"start_time": "2021-05-04T11:07:16.363908",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"!cp -r ../input/addisamples ds_shared_drive"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:07:17.564472Z",
	"iopub.status.busy": "2021-05-04T11:07:17.560621Z",
	"iopub.status.idle": "2021-05-04T11:07:18.287784Z",
	"shell.execute_reply": "2021-05-04T11:07:18.287200Z"
	},
	"papermill": {
	"duration": 0.74864,
	"end_time": "2021-05-04T11:07:18.287925",
	"exception": false,
	"start_time": "2021-05-04T11:07:17.539285",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"!mkdir -p assets"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"papermill": {
	"duration": 0.016122,
	"end_time": "2021-05-04T11:07:18.320427",
	"exception": false,
	"start_time": "2021-05-04T11:07:18.304305",
	"status": "completed"
	},
	"tags": []
	},
	"source": [
	"# Data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:07:18.359185Z",
	"iopub.status.busy": "2021-05-04T11:07:18.358280Z",
	"iopub.status.idle": "2021-05-04T11:07:18.361270Z",
	"shell.execute_reply": "2021-05-04T11:07:18.361717Z"
	},
	"papermill": {
	"duration": 0.025337,
	"end_time": "2021-05-04T11:07:18.361887",
	"exception": false,
	"start_time": "2021-05-04T11:07:18.336550",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"import os\n",
	"\n",
	"# Please use the absolute for the location of the dataset.\n",
	"# Or you can use relative path with `os.getcwd() + \"test_data/validation.csv\"`\n",
	"AICROWD_DATASET_PATH = os.getenv(\"DATASET_PATH\", \"ds_shared_drive/train.csv\")\n",
	"AICROWD_PREDICTIONS_PATH = os.getenv(\"PREDICTIONS_PATH\", \"predictions.csv\")\n",
	"AICROWD_ASSETS_DIR = \"assets\"\n",
	"AICROWD_API_KEY = \"\" # Get your key from https://www.aicrowd.com/participants/me"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:07:18.402596Z",
	"iopub.status.busy": "2021-05-04T11:07:18.401940Z",
	"iopub.status.idle": "2021-05-04T11:07:19.350933Z",
	"shell.execute_reply": "2021-05-04T11:07:19.351671Z"
	},
	"papermill": {
	"duration": 0.973501,
	"end_time": "2021-05-04T11:07:19.351868",
	"exception": false,
	"start_time": "2021-05-04T11:07:18.378367",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"import numpy as np # linear algebra\n",
	"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
	"\n",
	"import seaborn as sns\n",
	"import matplotlib.pyplot as plt\n",
	"%matplotlib inline\n",
	"sns.set()\n",
	"\n",
	"pd.set_option('display.max_rows', 500)\n",
	"pd.set_option('display.max_columns', 500)\n",
	"pd.set_option('display.width', 1000)\n",
	"\n",
	"import warnings\n",
	"warnings.filterwarnings(\"ignore\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:07:19.396788Z",
	"iopub.status.busy": "2021-05-04T11:07:19.395765Z",
	"iopub.status.idle": "2021-05-04T11:07:20.071584Z",
	"shell.execute_reply": "2021-05-04T11:07:20.070493Z"
	},
	"papermill": {
	"duration": 0.70269,
	"end_time": "2021-05-04T11:07:20.071774",
	"exception": false,
	"start_time": "2021-05-04T11:07:19.369084",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"target_col = \"diagnosis\"\n",
	"key_col = \"row_id\"\n",
	"cat_cols = ['intersection_pos_rel_centre']\n",
	"seed = 2021\n",
	"\n",
	"target_values = [\"normal\", \"post_alzheimer\", \"pre_alzheimer\"]\n",
	"\n",
	"train = pd.read_csv(AICROWD_DATASET_PATH)\n",
	"train = train[train[target_col].isin(target_values)].copy().reset_index(drop=True)\n",
	"\n",
	"\n",
	"print(train.shape)\n",
	"features = train.columns[1:-1].to_list()\n",
	"\n",
	"numeric_features = [c for c in features if c not in cat_cols]\n",
	"for c in numeric_features:\n",
	" train[c] = train[c].astype(float)\n",
	"\n",
	"print(train[target_col].value_counts())\n",
	"train.tail(3)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"papermill": {
	"duration": 0.018283,
	"end_time": "2021-05-04T11:07:20.108748",
	"exception": false,
	"start_time": "2021-05-04T11:07:20.090465",
	"status": "completed"
	},
	"tags": []
	},
	"source": [
	"## Target"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:07:20.165457Z",
	"iopub.status.busy": "2021-05-04T11:07:20.164718Z",
	"iopub.status.idle": "2021-05-04T11:07:20.357428Z",
	"shell.execute_reply": "2021-05-04T11:07:20.357904Z"
	},
	"papermill": {
	"duration": 0.230886,
	"end_time": "2021-05-04T11:07:20.358084",
	"exception": false,
	"start_time": "2021-05-04T11:07:20.127198",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"sns.countplot(x=target_col, data=train);"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"papermill": {
	"duration": 0.018745,
	"end_time": "2021-05-04T11:07:20.396308",
	"exception": false,
	"start_time": "2021-05-04T11:07:20.377563",
	"status": "completed"
	},
	"tags": []
	},
	"source": [
	"## Numerical features"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:07:20.438665Z",
	"iopub.status.busy": "2021-05-04T11:07:20.437987Z",
	"iopub.status.idle": "2021-05-04T11:08:29.318670Z",
	"shell.execute_reply": "2021-05-04T11:08:29.317715Z"
	},
	"papermill": {
	"duration": 68.903519,
	"end_time": "2021-05-04T11:08:29.318826",
	"exception": false,
	"start_time": "2021-05-04T11:07:20.415307",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"nb_shown = len(numeric_features)\n",
	"fig, ax = plt.subplots(nb_shown, 1, figsize=(20,5*nb_shown))\n",
	"\n",
	"colors = [\"Green\", \"Blue\", \"Red\"]\n",
	"for i, col in enumerate(numeric_features[:nb_shown]):\n",
	" for value, color in zip(target_values, colors):\n",
	" sns.distplot(train.loc[train[target_col]==value, col], \n",
	" ax=ax[i], color=color, norm_hist=True)\n",
	" ax[i].set_title(\"Train {}\".format(col))\n",
	" ax[i].set_xlabel(\"\")\n",
	" ax[i].set_xlabel(\"\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"papermill": {
	"duration": 0.147845,
	"end_time": "2021-05-04T11:08:29.616065",
	"exception": false,
	"start_time": "2021-05-04T11:08:29.468220",
	"status": "completed"
	},
	"tags": []
	},
	"source": [
	"## Categorical features\n",
	"There is only 1 single categorical feature"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:29.919719Z",
	"iopub.status.busy": "2021-05-04T11:08:29.918894Z",
	"iopub.status.idle": "2021-05-04T11:08:30.210596Z",
	"shell.execute_reply": "2021-05-04T11:08:30.210029Z"
	},
	"papermill": {
	"duration": 0.446772,
	"end_time": "2021-05-04T11:08:30.210751",
	"exception": false,
	"start_time": "2021-05-04T11:08:29.763979",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"sns.countplot(x=cat_cols[0], hue=target_col, data=train[cat_cols+[target_col]].fillna(\"NA\"));"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:30.523489Z",
	"iopub.status.busy": "2021-05-04T11:08:30.517349Z",
	"iopub.status.idle": "2021-05-04T11:08:30.768144Z",
	"shell.execute_reply": "2021-05-04T11:08:30.767456Z"
	},
	"papermill": {
	"duration": 0.408804,
	"end_time": "2021-05-04T11:08:30.768280",
	"exception": false,
	"start_time": "2021-05-04T11:08:30.359476",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"df_pos = train[train[target_col].isin(target_values[1:])]\n",
	"nb_pos = df_pos.shape[0]\n",
	"nb_neg = nb_pos\n",
	"df_neg = train[train[target_col] == \"normal\"].sample(n=nb_neg, random_state=seed)\n",
	"df_samples = pd.concat([df_pos, df_neg]).sample(frac=1).reset_index(drop=True)\n",
	"\n",
	"sns.countplot(x=cat_cols[0], hue=target_col, data=df_samples[cat_cols+[target_col]].fillna(\"NA\"));"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"papermill": {
	"duration": 0.149623,
	"end_time": "2021-05-04T11:08:31.067408",
	"exception": false,
	"start_time": "2021-05-04T11:08:30.917785",
	"status": "completed"
	},
	"tags": []
	},
	"source": [
	"# Baseline\n",
	"Because of the imbalance dataset, I will use the balanced one to create the baseline. "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:31.380448Z",
	"iopub.status.busy": "2021-05-04T11:08:31.379740Z",
	"iopub.status.idle": "2021-05-04T11:08:31.510144Z",
	"shell.execute_reply": "2021-05-04T11:08:31.509462Z"
	},
	"papermill": {
	"duration": 0.292522,
	"end_time": "2021-05-04T11:08:31.510296",
	"exception": false,
	"start_time": "2021-05-04T11:08:31.217774",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"print(cat_cols)\n",
	"for c in cat_cols:\n",
	" df_samples[c].fillna(\"NA\", inplace=True)\n",
	" \n",
	"df_dummies = pd.get_dummies(df_samples[cat_cols], columns=cat_cols, dummy_na=True).add_prefix('CAT_')\n",
	"dummy_cols = df_dummies.columns.to_list()\n",
	"print(dummy_cols)\n",
	"\n",
	"df_samples = pd.concat([df_samples, df_dummies], axis=1)\n",
	"df_samples['cnt_NaN'] = df_samples[numeric_features].isna().sum(axis=1)\n",
	"\n",
	"df_samples.fillna(-1, inplace=True)\n",
	"df_samples.head(3)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:31.822535Z",
	"iopub.status.busy": "2021-05-04T11:08:31.821394Z",
	"iopub.status.idle": "2021-05-04T11:08:31.849740Z",
	"shell.execute_reply": "2021-05-04T11:08:31.849135Z"
	},
	"papermill": {
	"duration": 0.187767,
	"end_time": "2021-05-04T11:08:31.849879",
	"exception": false,
	"start_time": "2021-05-04T11:08:31.662112",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"model_features = df_samples.columns.to_list()\n",
	"model_features = [c for c in model_features if c not in [key_col, target_col] + cat_cols]\n",
	"\n",
	"unique_value_cols = []\n",
	"for c in model_features:\n",
	" if df_samples[c].unique().shape[0] == 1:\n",
	" unique_value_cols.append(c)\n",
	" \n",
	"print(unique_value_cols)\n",
	"model_features = [c for c in model_features if c not in unique_value_cols]\n",
	"print(len(model_features))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:32.164878Z",
	"iopub.status.busy": "2021-05-04T11:08:32.164098Z",
	"iopub.status.idle": "2021-05-04T11:08:33.405424Z",
	"shell.execute_reply": "2021-05-04T11:08:33.404724Z"
	},
	"papermill": {
	"duration": 1.404616,
	"end_time": "2021-05-04T11:08:33.405562",
	"exception": false,
	"start_time": "2021-05-04T11:08:32.000946",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"import lightgbm as lgb\n",
	"from sklearn.model_selection import train_test_split\n",
	"from sklearn.model_selection import StratifiedKFold\n",
	"\n",
	"X_train = df_samples[model_features]\n",
	"y_train = df_samples[target_col].map(dict(zip(target_values, list(range(len(target_values))))))\n",
	"\n",
	"X_test = df_samples[model_features]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:33.715488Z",
	"iopub.status.busy": "2021-05-04T11:08:33.714549Z",
	"iopub.status.idle": "2021-05-04T11:08:45.237648Z",
	"shell.execute_reply": "2021-05-04T11:08:45.242262Z"
	},
	"papermill": {
	"duration": 11.685539,
	"end_time": "2021-05-04T11:08:45.242603",
	"exception": false,
	"start_time": "2021-05-04T11:08:33.557064",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"skf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)\n",
	"preds = 0.0\n",
	"\n",
	"params = {\n",
	" \"objective\" : \"multiclass\",\n",
	" \"num_class\" : len(target_values),\n",
	" \"bagging_seed\" : 2021,\n",
	" \"verbosity\" : 1 }\n",
	"\n",
	"clfs = []\n",
	"for fold, (itrain, ivalid) in enumerate(skf.split(X_train, y_train)):\n",
	" print(\"-\"*40)\n",
	" print(f\"Running for fold {fold}\")\n",
	" lgb_train = lgb.Dataset(X_train.iloc[itrain], y_train.iloc[itrain])\n",
	" lgb_eval = lgb.Dataset(X_train.iloc[ivalid], y_train.iloc[ivalid], reference = lgb_train)\n",
	" clf = lgb.train(params, lgb_train, 1000, valid_sets=[lgb_eval], \n",
	" early_stopping_rounds=100, verbose_eval=200)\n",
	"\n",
	" clfs.append(clf)\n",
	" pred = clf.predict(X_test)\n",
	" preds += pred/skf.n_splits"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:45.599328Z",
	"iopub.status.busy": "2021-05-04T11:08:45.597803Z",
	"iopub.status.idle": "2021-05-04T11:08:46.008833Z",
	"shell.execute_reply": "2021-05-04T11:08:46.009377Z"
	},
	"papermill": {
	"duration": 0.595883,
	"end_time": "2021-05-04T11:08:46.009566",
	"exception": false,
	"start_time": "2021-05-04T11:08:45.413683",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"lgb.plot_importance(clf, max_num_features=20);"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:46.331978Z",
	"iopub.status.busy": "2021-05-04T11:08:46.330935Z",
	"iopub.status.idle": "2021-05-04T11:08:46.363726Z",
	"shell.execute_reply": "2021-05-04T11:08:46.364366Z"
	},
	"papermill": {
	"duration": 0.195863,
	"end_time": "2021-05-04T11:08:46.364563",
	"exception": false,
	"start_time": "2021-05-04T11:08:46.168700",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"import joblib\n",
	"for i, clf in enumerate(clfs):\n",
	" model_filename = f'{AICROWD_ASSETS_DIR}/model_lgb_fold_{i}.pkl'\n",
	" joblib.dump(clf, model_filename)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"papermill": {
	"duration": 0.158194,
	"end_time": "2021-05-04T11:08:46.681169",
	"exception": false,
	"start_time": "2021-05-04T11:08:46.522975",
	"status": "completed"
	},
	"tags": []
	},
	"source": [
	"## Prediction"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:47.071316Z",
	"iopub.status.busy": "2021-05-04T11:08:47.037267Z",
	"iopub.status.idle": "2021-05-04T11:08:47.158382Z",
	"shell.execute_reply": "2021-05-04T11:08:47.157836Z"
	},
	"papermill": {
	"duration": 0.31786,
	"end_time": "2021-05-04T11:08:47.158529",
	"exception": false,
	"start_time": "2021-05-04T11:08:46.840669",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"df_test = train.sample(n=100)\n",
	" \n",
	"for c in numeric_features:\n",
	" df_test[c] = df_test[c].astype(float)\n",
	" \n",
	"for c in cat_cols:\n",
	" df_test[c].fillna(\"NA\", inplace=True)\n",
	" \n",
	"df_test_dummies = pd.get_dummies(df_test[cat_cols], columns=cat_cols, dummy_na=True).add_prefix('CAT_')\n",
	"df_test = pd.concat([df_test, df_test_dummies], axis=1)\n",
	"df_test['cnt_NaN'] = df_test[numeric_features].isna().sum(axis=1)\n",
	"\n",
	"df_test.fillna(-1, inplace=True)\n",
	"\n",
	"for c in dummy_cols:\n",
	" if c not in df_test.columns:\n",
	" df_test[c] = 0\n",
	"\n",
	"print(\"Missing columns:\", [c for c in model_features if c not in df_test.columns])\n",
	"df_test.head(3)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:47.489364Z",
	"iopub.status.busy": "2021-05-04T11:08:47.488466Z",
	"iopub.status.idle": "2021-05-04T11:08:47.534274Z",
	"shell.execute_reply": "2021-05-04T11:08:47.535228Z"
	},
	"papermill": {
	"duration": 0.215922,
	"end_time": "2021-05-04T11:08:47.535430",
	"exception": false,
	"start_time": "2021-05-04T11:08:47.319508",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"import joblib\n",
	"\n",
	"X_test = df_test[model_features]\n",
	"\n",
	"preds = 0.0\n",
	"nb_folds = 5 # skf.n_splits\n",
	"for fold in range(nb_folds):\n",
	" print(\"-\"*40)\n",
	" print(f\"Running for fold {fold}\")\n",
	" model_filename = f'{AICROWD_ASSETS_DIR}/model_lgb_fold_{i}.pkl'\n",
	" \n",
	" clf = joblib.load(model_filename)\n",
	" pred = clf.predict(X_test)\n",
	" preds += pred/nb_folds\n",
	" \n",
	"print(preds.shape)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"execution": {
	"iopub.execute_input": "2021-05-04T11:08:47.893006Z",
	"iopub.status.busy": "2021-05-04T11:08:47.888741Z",
	"iopub.status.idle": "2021-05-04T11:08:48.174692Z",
	"shell.execute_reply": "2021-05-04T11:08:48.174030Z"
	},
	"papermill": {
	"duration": 0.478602,
	"end_time": "2021-05-04T11:08:48.174837",
	"exception": false,
	"start_time": "2021-05-04T11:08:47.696235",
	"status": "completed"
	},
	"tags": []
	},
	"outputs": [],
	"source": [
	"print(preds.min(), preds.max())\n",
	"for i, (value, color) in enumerate(zip(target_values, colors)):\n",
	" sns.distplot(preds[:, i], color=color, norm_hist=True)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"papermill": {
	"duration": 0.160153,
	"end_time": "2021-05-04T11:08:48.495111",
	"exception": false,
	"start_time": "2021-05-04T11:08:48.334958",
	"status": "completed"
	},
	"tags": []
	},
	"source": [
	"# End"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.8.5"
	},
	"papermill": {
	"default_parameters": {},
	"duration": 103.180567,
	"end_time": "2021-05-04T11:08:50.768320",
	"environment_variables": {},
	"exception": null,
	"input_path": "__notebook__.ipynb",
	"output_path": "__notebook__.ipynb",
	"parameters": {},
	"start_time": "2021-05-04T11:07:07.587753",
	"version": "2.3.3"
	},
	"toc": {
	"base_numbering": 1,
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}