Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aicrowd-bot/b5ab1d907083566c9b492aa362534374 to your computer and use it in GitHub Desktop.
Save aicrowd-bot/b5ab1d907083566c9b492aa362534374 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:07:16.404707Z",
"iopub.status.busy": "2021-05-04T11:07:16.392205Z",
"iopub.status.idle": "2021-05-04T11:07:17.522413Z",
"shell.execute_reply": "2021-05-04T11:07:17.521622Z"
},
"papermill": {
"duration": 1.158665,
"end_time": "2021-05-04T11:07:17.522573",
"exception": false,
"start_time": "2021-05-04T11:07:16.363908",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"!cp -r ../input/addisamples ds_shared_drive"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:07:17.564472Z",
"iopub.status.busy": "2021-05-04T11:07:17.560621Z",
"iopub.status.idle": "2021-05-04T11:07:18.287784Z",
"shell.execute_reply": "2021-05-04T11:07:18.287200Z"
},
"papermill": {
"duration": 0.74864,
"end_time": "2021-05-04T11:07:18.287925",
"exception": false,
"start_time": "2021-05-04T11:07:17.539285",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"!mkdir -p assets"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.016122,
"end_time": "2021-05-04T11:07:18.320427",
"exception": false,
"start_time": "2021-05-04T11:07:18.304305",
"status": "completed"
},
"tags": []
},
"source": [
"# Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:07:18.359185Z",
"iopub.status.busy": "2021-05-04T11:07:18.358280Z",
"iopub.status.idle": "2021-05-04T11:07:18.361270Z",
"shell.execute_reply": "2021-05-04T11:07:18.361717Z"
},
"papermill": {
"duration": 0.025337,
"end_time": "2021-05-04T11:07:18.361887",
"exception": false,
"start_time": "2021-05-04T11:07:18.336550",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Please use the absolute for the location of the dataset.\n",
"# Or you can use relative path with `os.getcwd() + \"test_data/validation.csv\"`\n",
"AICROWD_DATASET_PATH = os.getenv(\"DATASET_PATH\", \"ds_shared_drive/train.csv\")\n",
"AICROWD_PREDICTIONS_PATH = os.getenv(\"PREDICTIONS_PATH\", \"predictions.csv\")\n",
"AICROWD_ASSETS_DIR = \"assets\"\n",
"AICROWD_API_KEY = \"\" # Get your key from https://www.aicrowd.com/participants/me"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:07:18.402596Z",
"iopub.status.busy": "2021-05-04T11:07:18.401940Z",
"iopub.status.idle": "2021-05-04T11:07:19.350933Z",
"shell.execute_reply": "2021-05-04T11:07:19.351671Z"
},
"papermill": {
"duration": 0.973501,
"end_time": "2021-05-04T11:07:19.351868",
"exception": false,
"start_time": "2021-05-04T11:07:18.378367",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"import numpy as np # linear algebra\n",
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
"\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"sns.set()\n",
"\n",
"pd.set_option('display.max_rows', 500)\n",
"pd.set_option('display.max_columns', 500)\n",
"pd.set_option('display.width', 1000)\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:07:19.396788Z",
"iopub.status.busy": "2021-05-04T11:07:19.395765Z",
"iopub.status.idle": "2021-05-04T11:07:20.071584Z",
"shell.execute_reply": "2021-05-04T11:07:20.070493Z"
},
"papermill": {
"duration": 0.70269,
"end_time": "2021-05-04T11:07:20.071774",
"exception": false,
"start_time": "2021-05-04T11:07:19.369084",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"target_col = \"diagnosis\"\n",
"key_col = \"row_id\"\n",
"cat_cols = ['intersection_pos_rel_centre']\n",
"seed = 2021\n",
"\n",
"target_values = [\"normal\", \"post_alzheimer\", \"pre_alzheimer\"]\n",
"\n",
"train = pd.read_csv(AICROWD_DATASET_PATH)\n",
"train = train[train[target_col].isin(target_values)].copy().reset_index(drop=True)\n",
"\n",
"\n",
"print(train.shape)\n",
"features = train.columns[1:-1].to_list()\n",
"\n",
"numeric_features = [c for c in features if c not in cat_cols]\n",
"for c in numeric_features:\n",
" train[c] = train[c].astype(float)\n",
"\n",
"print(train[target_col].value_counts())\n",
"train.tail(3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.018283,
"end_time": "2021-05-04T11:07:20.108748",
"exception": false,
"start_time": "2021-05-04T11:07:20.090465",
"status": "completed"
},
"tags": []
},
"source": [
"## Target"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:07:20.165457Z",
"iopub.status.busy": "2021-05-04T11:07:20.164718Z",
"iopub.status.idle": "2021-05-04T11:07:20.357428Z",
"shell.execute_reply": "2021-05-04T11:07:20.357904Z"
},
"papermill": {
"duration": 0.230886,
"end_time": "2021-05-04T11:07:20.358084",
"exception": false,
"start_time": "2021-05-04T11:07:20.127198",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"sns.countplot(x=target_col, data=train);"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.018745,
"end_time": "2021-05-04T11:07:20.396308",
"exception": false,
"start_time": "2021-05-04T11:07:20.377563",
"status": "completed"
},
"tags": []
},
"source": [
"## Numerical features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:07:20.438665Z",
"iopub.status.busy": "2021-05-04T11:07:20.437987Z",
"iopub.status.idle": "2021-05-04T11:08:29.318670Z",
"shell.execute_reply": "2021-05-04T11:08:29.317715Z"
},
"papermill": {
"duration": 68.903519,
"end_time": "2021-05-04T11:08:29.318826",
"exception": false,
"start_time": "2021-05-04T11:07:20.415307",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"nb_shown = len(numeric_features)\n",
"fig, ax = plt.subplots(nb_shown, 1, figsize=(20,5*nb_shown))\n",
"\n",
"colors = [\"Green\", \"Blue\", \"Red\"]\n",
"for i, col in enumerate(numeric_features[:nb_shown]):\n",
" for value, color in zip(target_values, colors):\n",
" sns.distplot(train.loc[train[target_col]==value, col], \n",
" ax=ax[i], color=color, norm_hist=True)\n",
" ax[i].set_title(\"Train {}\".format(col))\n",
" ax[i].set_xlabel(\"\")\n",
" ax[i].set_xlabel(\"\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.147845,
"end_time": "2021-05-04T11:08:29.616065",
"exception": false,
"start_time": "2021-05-04T11:08:29.468220",
"status": "completed"
},
"tags": []
},
"source": [
"## Categorical features\n",
"There is only 1 single categorical feature"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:29.919719Z",
"iopub.status.busy": "2021-05-04T11:08:29.918894Z",
"iopub.status.idle": "2021-05-04T11:08:30.210596Z",
"shell.execute_reply": "2021-05-04T11:08:30.210029Z"
},
"papermill": {
"duration": 0.446772,
"end_time": "2021-05-04T11:08:30.210751",
"exception": false,
"start_time": "2021-05-04T11:08:29.763979",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"sns.countplot(x=cat_cols[0], hue=target_col, data=train[cat_cols+[target_col]].fillna(\"NA\"));"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:30.523489Z",
"iopub.status.busy": "2021-05-04T11:08:30.517349Z",
"iopub.status.idle": "2021-05-04T11:08:30.768144Z",
"shell.execute_reply": "2021-05-04T11:08:30.767456Z"
},
"papermill": {
"duration": 0.408804,
"end_time": "2021-05-04T11:08:30.768280",
"exception": false,
"start_time": "2021-05-04T11:08:30.359476",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"df_pos = train[train[target_col].isin(target_values[1:])]\n",
"nb_pos = df_pos.shape[0]\n",
"nb_neg = nb_pos\n",
"df_neg = train[train[target_col] == \"normal\"].sample(n=nb_neg, random_state=seed)\n",
"df_samples = pd.concat([df_pos, df_neg]).sample(frac=1).reset_index(drop=True)\n",
"\n",
"sns.countplot(x=cat_cols[0], hue=target_col, data=df_samples[cat_cols+[target_col]].fillna(\"NA\"));"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.149623,
"end_time": "2021-05-04T11:08:31.067408",
"exception": false,
"start_time": "2021-05-04T11:08:30.917785",
"status": "completed"
},
"tags": []
},
"source": [
"# Baseline\n",
"Because of the imbalance dataset, I will use the balanced one to create the baseline. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:31.380448Z",
"iopub.status.busy": "2021-05-04T11:08:31.379740Z",
"iopub.status.idle": "2021-05-04T11:08:31.510144Z",
"shell.execute_reply": "2021-05-04T11:08:31.509462Z"
},
"papermill": {
"duration": 0.292522,
"end_time": "2021-05-04T11:08:31.510296",
"exception": false,
"start_time": "2021-05-04T11:08:31.217774",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"print(cat_cols)\n",
"for c in cat_cols:\n",
" df_samples[c].fillna(\"NA\", inplace=True)\n",
" \n",
"df_dummies = pd.get_dummies(df_samples[cat_cols], columns=cat_cols, dummy_na=True).add_prefix('CAT_')\n",
"dummy_cols = df_dummies.columns.to_list()\n",
"print(dummy_cols)\n",
"\n",
"df_samples = pd.concat([df_samples, df_dummies], axis=1)\n",
"df_samples['cnt_NaN'] = df_samples[numeric_features].isna().sum(axis=1)\n",
"\n",
"df_samples.fillna(-1, inplace=True)\n",
"df_samples.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:31.822535Z",
"iopub.status.busy": "2021-05-04T11:08:31.821394Z",
"iopub.status.idle": "2021-05-04T11:08:31.849740Z",
"shell.execute_reply": "2021-05-04T11:08:31.849135Z"
},
"papermill": {
"duration": 0.187767,
"end_time": "2021-05-04T11:08:31.849879",
"exception": false,
"start_time": "2021-05-04T11:08:31.662112",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"model_features = df_samples.columns.to_list()\n",
"model_features = [c for c in model_features if c not in [key_col, target_col] + cat_cols]\n",
"\n",
"unique_value_cols = []\n",
"for c in model_features:\n",
" if df_samples[c].unique().shape[0] == 1:\n",
" unique_value_cols.append(c)\n",
" \n",
"print(unique_value_cols)\n",
"model_features = [c for c in model_features if c not in unique_value_cols]\n",
"print(len(model_features))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:32.164878Z",
"iopub.status.busy": "2021-05-04T11:08:32.164098Z",
"iopub.status.idle": "2021-05-04T11:08:33.405424Z",
"shell.execute_reply": "2021-05-04T11:08:33.404724Z"
},
"papermill": {
"duration": 1.404616,
"end_time": "2021-05-04T11:08:33.405562",
"exception": false,
"start_time": "2021-05-04T11:08:32.000946",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"import lightgbm as lgb\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.model_selection import StratifiedKFold\n",
"\n",
"X_train = df_samples[model_features]\n",
"y_train = df_samples[target_col].map(dict(zip(target_values, list(range(len(target_values))))))\n",
"\n",
"X_test = df_samples[model_features]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:33.715488Z",
"iopub.status.busy": "2021-05-04T11:08:33.714549Z",
"iopub.status.idle": "2021-05-04T11:08:45.237648Z",
"shell.execute_reply": "2021-05-04T11:08:45.242262Z"
},
"papermill": {
"duration": 11.685539,
"end_time": "2021-05-04T11:08:45.242603",
"exception": false,
"start_time": "2021-05-04T11:08:33.557064",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"skf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)\n",
"preds = 0.0\n",
"\n",
"params = {\n",
" \"objective\" : \"multiclass\",\n",
" \"num_class\" : len(target_values),\n",
" \"bagging_seed\" : 2021,\n",
" \"verbosity\" : 1 }\n",
"\n",
"clfs = []\n",
"for fold, (itrain, ivalid) in enumerate(skf.split(X_train, y_train)):\n",
" print(\"-\"*40)\n",
" print(f\"Running for fold {fold}\")\n",
" lgb_train = lgb.Dataset(X_train.iloc[itrain], y_train.iloc[itrain])\n",
" lgb_eval = lgb.Dataset(X_train.iloc[ivalid], y_train.iloc[ivalid], reference = lgb_train)\n",
" clf = lgb.train(params, lgb_train, 1000, valid_sets=[lgb_eval], \n",
" early_stopping_rounds=100, verbose_eval=200)\n",
"\n",
" clfs.append(clf)\n",
" pred = clf.predict(X_test)\n",
" preds += pred/skf.n_splits"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:45.599328Z",
"iopub.status.busy": "2021-05-04T11:08:45.597803Z",
"iopub.status.idle": "2021-05-04T11:08:46.008833Z",
"shell.execute_reply": "2021-05-04T11:08:46.009377Z"
},
"papermill": {
"duration": 0.595883,
"end_time": "2021-05-04T11:08:46.009566",
"exception": false,
"start_time": "2021-05-04T11:08:45.413683",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"lgb.plot_importance(clf, max_num_features=20);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:46.331978Z",
"iopub.status.busy": "2021-05-04T11:08:46.330935Z",
"iopub.status.idle": "2021-05-04T11:08:46.363726Z",
"shell.execute_reply": "2021-05-04T11:08:46.364366Z"
},
"papermill": {
"duration": 0.195863,
"end_time": "2021-05-04T11:08:46.364563",
"exception": false,
"start_time": "2021-05-04T11:08:46.168700",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"import joblib\n",
"for i, clf in enumerate(clfs):\n",
" model_filename = f'{AICROWD_ASSETS_DIR}/model_lgb_fold_{i}.pkl'\n",
" joblib.dump(clf, model_filename)"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.158194,
"end_time": "2021-05-04T11:08:46.681169",
"exception": false,
"start_time": "2021-05-04T11:08:46.522975",
"status": "completed"
},
"tags": []
},
"source": [
"## Prediction"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:47.071316Z",
"iopub.status.busy": "2021-05-04T11:08:47.037267Z",
"iopub.status.idle": "2021-05-04T11:08:47.158382Z",
"shell.execute_reply": "2021-05-04T11:08:47.157836Z"
},
"papermill": {
"duration": 0.31786,
"end_time": "2021-05-04T11:08:47.158529",
"exception": false,
"start_time": "2021-05-04T11:08:46.840669",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"df_test = train.sample(n=100)\n",
" \n",
"for c in numeric_features:\n",
" df_test[c] = df_test[c].astype(float)\n",
" \n",
"for c in cat_cols:\n",
" df_test[c].fillna(\"NA\", inplace=True)\n",
" \n",
"df_test_dummies = pd.get_dummies(df_test[cat_cols], columns=cat_cols, dummy_na=True).add_prefix('CAT_')\n",
"df_test = pd.concat([df_test, df_test_dummies], axis=1)\n",
"df_test['cnt_NaN'] = df_test[numeric_features].isna().sum(axis=1)\n",
"\n",
"df_test.fillna(-1, inplace=True)\n",
"\n",
"for c in dummy_cols:\n",
" if c not in df_test.columns:\n",
" df_test[c] = 0\n",
"\n",
"print(\"Missing columns:\", [c for c in model_features if c not in df_test.columns])\n",
"df_test.head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:47.489364Z",
"iopub.status.busy": "2021-05-04T11:08:47.488466Z",
"iopub.status.idle": "2021-05-04T11:08:47.534274Z",
"shell.execute_reply": "2021-05-04T11:08:47.535228Z"
},
"papermill": {
"duration": 0.215922,
"end_time": "2021-05-04T11:08:47.535430",
"exception": false,
"start_time": "2021-05-04T11:08:47.319508",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"import joblib\n",
"\n",
"X_test = df_test[model_features]\n",
"\n",
"preds = 0.0\n",
"nb_folds = 5 # skf.n_splits\n",
"for fold in range(nb_folds):\n",
" print(\"-\"*40)\n",
" print(f\"Running for fold {fold}\")\n",
" model_filename = f'{AICROWD_ASSETS_DIR}/model_lgb_fold_{i}.pkl'\n",
" \n",
" clf = joblib.load(model_filename)\n",
" pred = clf.predict(X_test)\n",
" preds += pred/nb_folds\n",
" \n",
"print(preds.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2021-05-04T11:08:47.893006Z",
"iopub.status.busy": "2021-05-04T11:08:47.888741Z",
"iopub.status.idle": "2021-05-04T11:08:48.174692Z",
"shell.execute_reply": "2021-05-04T11:08:48.174030Z"
},
"papermill": {
"duration": 0.478602,
"end_time": "2021-05-04T11:08:48.174837",
"exception": false,
"start_time": "2021-05-04T11:08:47.696235",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"print(preds.min(), preds.max())\n",
"for i, (value, color) in enumerate(zip(target_values, colors)):\n",
" sns.distplot(preds[:, i], color=color, norm_hist=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
"duration": 0.160153,
"end_time": "2021-05-04T11:08:48.495111",
"exception": false,
"start_time": "2021-05-04T11:08:48.334958",
"status": "completed"
},
"tags": []
},
"source": [
"# End"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"papermill": {
"default_parameters": {},
"duration": 103.180567,
"end_time": "2021-05-04T11:08:50.768320",
"environment_variables": {},
"exception": null,
"input_path": "__notebook__.ipynb",
"output_path": "__notebook__.ipynb",
"parameters": {},
"start_time": "2021-05-04T11:07:07.587753",
"version": "2.3.3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment