Last active
November 25, 2022 20:46
-
-
Save harisonmg/a82e9e725d911ae499c713b249438a80 to your computer and use it in GitHub Desktop.
modelling_tutorial.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/harisonmg/a82e9e725d911ae499c713b249438a80/modelling_tutorial.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f73506c8", | |
"metadata": { | |
"papermill": { | |
"duration": 0.012187, | |
"end_time": "2022-10-11T08:53:40.074868", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:40.062681", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "f73506c8" | |
}, | |
"source": [ | |
"# Introduction\n", | |
"The problem at hand is a binary classification problem where we are required to\n", | |
"predict whether a person will default on a loan, given certain characteristics." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e1def3a3", | |
"metadata": { | |
"papermill": { | |
"duration": 0.007776, | |
"end_time": "2022-10-11T08:53:40.091433", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:40.083657", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "e1def3a3" | |
}, | |
"source": [ | |
"# Config" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# @title Download and extract the data\n", | |
"%%bash\n", | |
"rm -rf input && mkdir -p input/icea-lion-credit-risk\n", | |
"wget --no-check-certificate \"https://docs.google.com/uc?export=download&id=1KM3MnRLpOxUKESUbqFXBmiDoU_cna927\" \\\n", | |
" -O input/icea-lion-credit-risk.zip\n", | |
"unzip input/*.zip -d input/icea-lion-credit-risk" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"cellView": "form", | |
"id": "qk9xWhveThxl", | |
"outputId": "7149600c-67ca-4934-d58f-e24d3b8365c5" | |
}, | |
"id": "qk9xWhveThxl", | |
"execution_count": 1, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Archive: input/icea-lion-credit-risk.zip\n", | |
" inflating: input/icea-lion-credit-risk/test.csv \n", | |
" inflating: input/icea-lion-credit-risk/train.csv \n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"--2022-10-11 09:46:39-- https://docs.google.com/uc?export=download&id=1KM3MnRLpOxUKESUbqFXBmiDoU_cna927\n", | |
"Resolving docs.google.com (docs.google.com)... 172.217.212.139, 172.217.212.113, 172.217.212.138, ...\n", | |
"Connecting to docs.google.com (docs.google.com)|172.217.212.139|:443... connected.\n", | |
"HTTP request sent, awaiting response... 303 See Other\n", | |
"Location: https://doc-04-2s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/muf0llm271famd2hbjnu7o7oge7gb9qr/1665481575000/13037866814700494090/*/1KM3MnRLpOxUKESUbqFXBmiDoU_cna927?e=download&uuid=5aead628-29c6-4cda-9c00-98f580cedd83 [following]\n", | |
"Warning: wildcards not supported in HTTP.\n", | |
"--2022-10-11 09:46:40-- https://doc-04-2s-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/muf0llm271famd2hbjnu7o7oge7gb9qr/1665481575000/13037866814700494090/*/1KM3MnRLpOxUKESUbqFXBmiDoU_cna927?e=download&uuid=5aead628-29c6-4cda-9c00-98f580cedd83\n", | |
"Resolving doc-04-2s-docs.googleusercontent.com (doc-04-2s-docs.googleusercontent.com)... 74.125.69.132, 2607:f8b0:4001:c08::84\n", | |
"Connecting to doc-04-2s-docs.googleusercontent.com (doc-04-2s-docs.googleusercontent.com)|74.125.69.132|:443... connected.\n", | |
"HTTP request sent, awaiting response... 200 OK\n", | |
"Length: 375610 (367K) [application/zip]\n", | |
"Saving to: ‘input/icea-lion-credit-risk.zip’\n", | |
"\n", | |
" 0K .......... .......... .......... .......... .......... 13% 26.3M 0s\n", | |
" 50K .......... .......... .......... .......... .......... 27% 35.9M 0s\n", | |
" 100K .......... .......... .......... .......... .......... 40% 62.0M 0s\n", | |
" 150K .......... .......... .......... .......... .......... 54% 55.9M 0s\n", | |
" 200K .......... .......... .......... .......... .......... 68% 50.6M 0s\n", | |
" 250K .......... .......... .......... .......... .......... 81% 48.9M 0s\n", | |
" 300K .......... .......... .......... .......... .......... 95% 56.7M 0s\n", | |
" 350K .......... ...... 100% 63.3M=0.008s\n", | |
"\n", | |
"2022-10-11 09:46:40 (45.0 MB/s) - ‘input/icea-lion-credit-risk.zip’ saved [375610/375610]\n", | |
"\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "85576c0f", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:40.110392Z", | |
"iopub.status.busy": "2022-10-11T08:53:40.109362Z", | |
"iopub.status.idle": "2022-10-11T08:53:41.193697Z", | |
"shell.execute_reply": "2022-10-11T08:53:41.192520Z" | |
}, | |
"papermill": { | |
"duration": 1.097137, | |
"end_time": "2022-10-11T08:53:41.196667", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:40.099530", | |
"status": "completed" | |
}, | |
"tags": [], | |
"cellView": "form", | |
"id": "85576c0f" | |
}, | |
"outputs": [], | |
"source": [ | |
"# @title Imports\n", | |
"from pathlib import Path\n", | |
"\n", | |
"from sklearn import model_selection" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "47a76ebc", | |
"metadata": { | |
"papermill": { | |
"duration": 0.007942, | |
"end_time": "2022-10-11T08:53:41.213031", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.205089", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "47a76ebc" | |
}, | |
"source": [ | |
"Important settings:\n", | |
"- `TARGET_COL` - The target column.\n", | |
"- `CV_SPLITTER` - Cross-validation(CV) strategy. Because of class imbalance,\n", | |
"we use stratified k-fold CV with *k=5*. Read more about cross-validation on\n", | |
"the [scikit-learn documentation](https://scikit-learn.org/stable/modules/cross_validation.html)\n", | |
"- `EVAL_METRICS` - Model evaluation metrics. We use ROC AUC due to it's\n", | |
"simplicity and robustness to class imbalance. Read more about evaluation metrics\n", | |
"on the [scikit-learn documentation](https://scikit-learn.org/stable/modules/model_evaluation.html)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "3a615caa", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:41.231651Z", | |
"iopub.status.busy": "2022-10-11T08:53:41.231194Z", | |
"iopub.status.idle": "2022-10-11T08:53:41.240168Z", | |
"shell.execute_reply": "2022-10-11T08:53:41.238838Z" | |
}, | |
"papermill": { | |
"duration": 0.021261, | |
"end_time": "2022-10-11T08:53:41.242641", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.221380", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "3a615caa" | |
}, | |
"outputs": [], | |
"source": [ | |
"# file paths\n", | |
"DATA_DIR = Path(\"./input/icea-lion-credit-risk\")\n", | |
"\n", | |
"OUTPUT_DIR = Path(\"./output\")\n", | |
"\n", | |
"# data\n", | |
"TRAIN_DATA = DATA_DIR / \"train.csv\"\n", | |
"\n", | |
"TEST_DATA = DATA_DIR / \"test.csv\"\n", | |
"\n", | |
"# data types\n", | |
"NUMERIC_DTYPES = [\"int64\", \"float64\"]\n", | |
"\n", | |
"# columns in the data\n", | |
"INDEX_COL = None\n", | |
"\n", | |
"TARGET_COL = \"loan_status\"\n", | |
"\n", | |
"NUMERIC_FEATURES = (\n", | |
" # \"person_age\",\n", | |
" \"person_income\",\n", | |
" \"person_emp_length\",\n", | |
" \"loan_amnt\",\n", | |
" \"loan_int_rate\",\n", | |
" \"loan_percent_income\",\n", | |
" \"cb_person_cred_hist_length\",\n", | |
")\n", | |
"\n", | |
"CATEGORICAL_FEATURES = ()\n", | |
"\n", | |
"# random seed for reproducible results\n", | |
"RANDOM_SEED = 100\n", | |
"\n", | |
"# cross validation\n", | |
"NUM_FOLDS = 5\n", | |
"\n", | |
"CV_SPLITTER = model_selection.StratifiedKFold(\n", | |
" n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED\n", | |
")\n", | |
"\n", | |
"# metrics\n", | |
"EVAL_METRICS = (\"roc_auc\",)\n", | |
"\n", | |
"# parallel jobs\n", | |
"N_JOBS = -1\n", | |
"\n", | |
"# logging\n", | |
"VERBOSITY = 0\n", | |
"\n", | |
"VERBOSE = VERBOSITY > 1" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "68e5bba6", | |
"metadata": { | |
"papermill": { | |
"duration": 0.008982, | |
"end_time": "2022-10-11T08:53:41.259663", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.250681", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "68e5bba6" | |
}, | |
"source": [ | |
"# Load the data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "0cc39f92", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:41.277299Z", | |
"iopub.status.busy": "2022-10-11T08:53:41.276862Z", | |
"iopub.status.idle": "2022-10-11T08:53:41.282177Z", | |
"shell.execute_reply": "2022-10-11T08:53:41.281018Z" | |
}, | |
"papermill": { | |
"duration": 0.017063, | |
"end_time": "2022-10-11T08:53:41.284559", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.267496", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "0cc39f92" | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "19731c64", | |
"metadata": { | |
"_kg_hide-input": false, | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:41.302226Z", | |
"iopub.status.busy": "2022-10-11T08:53:41.301798Z", | |
"iopub.status.idle": "2022-10-11T08:53:41.423729Z", | |
"shell.execute_reply": "2022-10-11T08:53:41.422475Z" | |
}, | |
"papermill": { | |
"duration": 0.134256, | |
"end_time": "2022-10-11T08:53:41.426653", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.292397", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "19731c64", | |
"outputId": "c205ce21-9956-4e8b-a5df-358f3314d775", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"RangeIndex: 22850 entries, 0 to 22849\n", | |
"Data columns (total 12 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 person_age 22850 non-null int64 \n", | |
" 1 person_income 22850 non-null int64 \n", | |
" 2 person_home_ownership 22850 non-null object \n", | |
" 3 person_emp_length 22237 non-null float64\n", | |
" 4 loan_intent 22850 non-null object \n", | |
" 5 loan_grade 22850 non-null object \n", | |
" 6 loan_amnt 22850 non-null int64 \n", | |
" 7 loan_int_rate 20703 non-null float64\n", | |
" 8 loan_status 22850 non-null int64 \n", | |
" 9 loan_percent_income 22850 non-null float64\n", | |
" 10 cb_person_default_on_file 22850 non-null object \n", | |
" 11 cb_person_cred_hist_length 22850 non-null int64 \n", | |
"dtypes: float64(3), int64(5), object(4)\n", | |
"memory usage: 2.1+ MB\n" | |
] | |
} | |
], | |
"source": [ | |
"train_df = pd.read_csv(TRAIN_DATA, index_col=INDEX_COL)\n", | |
"train_df.info()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "f7a5b2aa", | |
"metadata": { | |
"_kg_hide-input": false, | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:41.444700Z", | |
"iopub.status.busy": "2022-10-11T08:53:41.444261Z", | |
"iopub.status.idle": "2022-10-11T08:53:41.484106Z", | |
"shell.execute_reply": "2022-10-11T08:53:41.483211Z" | |
}, | |
"papermill": { | |
"duration": 0.05237, | |
"end_time": "2022-10-11T08:53:41.487258", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.434888", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "f7a5b2aa", | |
"outputId": "60104c0a-55f2-4efb-e35d-8c0c149f9ffb", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"RangeIndex: 9731 entries, 0 to 9730\n", | |
"Data columns (total 12 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 person_age 9731 non-null int64 \n", | |
" 1 person_income 9731 non-null int64 \n", | |
" 2 person_home_ownership 9731 non-null object \n", | |
" 3 person_emp_length 9449 non-null float64\n", | |
" 4 loan_intent 9731 non-null object \n", | |
" 5 loan_grade 9731 non-null object \n", | |
" 6 loan_amnt 9731 non-null int64 \n", | |
" 7 loan_int_rate 8762 non-null float64\n", | |
" 8 loan_percent_income 9731 non-null float64\n", | |
" 9 cb_person_default_on_file 9731 non-null object \n", | |
" 10 cb_person_cred_hist_length 9731 non-null int64 \n", | |
" 11 loan_status 0 non-null float64\n", | |
"dtypes: float64(4), int64(4), object(4)\n", | |
"memory usage: 912.4+ KB\n" | |
] | |
} | |
], | |
"source": [ | |
"test_df = pd.read_csv(TEST_DATA, index_col=INDEX_COL)\n", | |
"test_df.info()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "6cc7ca5d", | |
"metadata": { | |
"papermill": { | |
"duration": 0.009017, | |
"end_time": "2022-10-11T08:53:41.505389", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.496372", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "6cc7ca5d" | |
}, | |
"source": [ | |
"# Define data preprocessing pipelines\n", | |
"The data we have needs to be transformed for various reasons such as:\n", | |
"1. To make it amenable to machine learning algorithms\n", | |
"2. To remove correlated features\n", | |
"\n", | |
"We use scikit-learn pipeline builders such as `Pipeline` and `ColumnTransformer` to\n", | |
"preprocess the data elegantly while avoiding errors and [data leakage](https://machinelearningmastery.com/data-leakage-machine-learning/).\n", | |
"\n", | |
"### Linear model pipelines\n", | |
"- Numeric features\n", | |
" 1. Standardize(subtract the mean and divide by standard deviation) using\n", | |
" [`StandardScaler`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)\n", | |
" 1. Impute missing values with median using [`SimpleImputer`](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)\n", | |
"- Categorical features\n", | |
" 1. One-hot encode using [`OneHotEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)\n", | |
"\n", | |
"### Tree-based model pipelines\n", | |
"- Numeric features\n", | |
" 1. Impute missing values with median using [`SimpleImputer`](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)\n", | |
"- Categorical features\n", | |
" 1. Ordinal encode using [`OrdinalEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "e661f059", | |
"metadata": { | |
"_kg_hide-input": true, | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:41.524337Z", | |
"iopub.status.busy": "2022-10-11T08:53:41.523685Z", | |
"iopub.status.idle": "2022-10-11T08:53:41.658317Z", | |
"shell.execute_reply": "2022-10-11T08:53:41.657018Z" | |
}, | |
"papermill": { | |
"duration": 0.147666, | |
"end_time": "2022-10-11T08:53:41.661335", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.513669", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "e661f059" | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn import compose, impute, pipeline, preprocessing" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "f9ef89c4", | |
"metadata": { | |
"_kg_hide-input": true, | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:41.679412Z", | |
"iopub.status.busy": "2022-10-11T08:53:41.678929Z", | |
"iopub.status.idle": "2022-10-11T08:53:41.687225Z", | |
"shell.execute_reply": "2022-10-11T08:53:41.685898Z" | |
}, | |
"papermill": { | |
"duration": 0.020359, | |
"end_time": "2022-10-11T08:53:41.689795", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.669436", | |
"status": "completed" | |
}, | |
"tags": [], | |
"cellView": "form", | |
"id": "f9ef89c4" | |
}, | |
"outputs": [], | |
"source": [ | |
"# @title Helper functions\n", | |
"def get_numeric_cols(model_type):\n", | |
" if NUMERIC_FEATURES:\n", | |
" # remove correlated features: use ratio for linear models and original\n", | |
" # features for tree-based models\n", | |
" if model_type == \"linear\":\n", | |
" remove_features = (\"person_income\", \"loan_amnt\")\n", | |
" elif model_type == \"tree\":\n", | |
" remove_features = (\"loan_percent_income\",)\n", | |
" else:\n", | |
" raise ValueError(f\"Invalid model type: {model_type!r}\")\n", | |
" return tuple(\n", | |
" feat for feat in NUMERIC_FEATURES if feat not in remove_features\n", | |
" )\n", | |
" return compose.make_column_selector(dtype_include=NUMERIC_DTYPES)\n", | |
"\n", | |
"\n", | |
"def get_categorical_cols():\n", | |
" if CATEGORICAL_FEATURES:\n", | |
" return CATEGORICAL_FEATURES\n", | |
" return compose.make_column_selector(dtype_exclude=NUMERIC_DTYPES)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "95a183f6", | |
"metadata": { | |
"_kg_hide-input": true, | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:41.708473Z", | |
"iopub.status.busy": "2022-10-11T08:53:41.708052Z", | |
"iopub.status.idle": "2022-10-11T08:53:41.723620Z", | |
"shell.execute_reply": "2022-10-11T08:53:41.722318Z" | |
}, | |
"papermill": { | |
"duration": 0.027855, | |
"end_time": "2022-10-11T08:53:41.726067", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.698212", | |
"status": "completed" | |
}, | |
"tags": [], | |
"cellView": "form", | |
"id": "95a183f6" | |
}, | |
"outputs": [], | |
"source": [ | |
"#@title Preprocessing pipelines\n", | |
"imputers = {\n", | |
" \"constant\": impute.SimpleImputer(strategy=\"constant\", fill_value=\"unknown\"),\n", | |
" \"knn\": impute.KNNImputer(),\n", | |
" \"mean\": impute.SimpleImputer(),\n", | |
" \"median\": impute.SimpleImputer(strategy=\"median\"),\n", | |
" \"mode\": impute.SimpleImputer(strategy=\"most_frequent\"),\n", | |
"}\n", | |
"\n", | |
"encoders = {\n", | |
" \"one_hot\": preprocessing.OneHotEncoder(handle_unknown=\"ignore\"),\n", | |
" \"ordinal\": preprocessing.OrdinalEncoder(\n", | |
" handle_unknown=\"use_encoded_value\", unknown_value=-999\n", | |
" ),\n", | |
"}\n", | |
"\n", | |
"scalers = {\n", | |
" \"max_abs\": preprocessing.MaxAbsScaler(),\n", | |
" \"min_max\": preprocessing.MinMaxScaler(),\n", | |
" \"standard\": preprocessing.StandardScaler(),\n", | |
"}\n", | |
"\n", | |
"numeric_pipelines = {\n", | |
" \"linear\": pipeline.Pipeline(\n", | |
" [\n", | |
" (\"standard_scaler\", scalers[\"standard\"]),\n", | |
" (\"median_imputer\", imputers[\"median\"]),\n", | |
" ],\n", | |
" verbose=VERBOSITY,\n", | |
" ),\n", | |
" \"tree\": pipeline.Pipeline(\n", | |
" [(\"median_imputer\", imputers[\"median\"])], verbose=VERBOSITY\n", | |
" ),\n", | |
"}\n", | |
"\n", | |
"categorical_pipelines = {\n", | |
" \"linear\": pipeline.Pipeline(\n", | |
" [(\"one_hot_encoder\", encoders[\"one_hot\"])], verbose=VERBOSITY\n", | |
" ),\n", | |
" \"tree\": pipeline.Pipeline(\n", | |
" [(\"ordinal_encoder\", encoders[\"ordinal\"])],\n", | |
" verbose=VERBOSITY,\n", | |
" ),\n", | |
"}\n", | |
"\n", | |
"preprocessors = {\n", | |
" \"lin_num\": compose.make_column_transformer(\n", | |
" (numeric_pipelines[\"linear\"], get_numeric_cols(\"linear\")),\n", | |
" n_jobs=N_JOBS,\n", | |
" verbose=VERBOSE,\n", | |
" ),\n", | |
" \"tree_num\": compose.make_column_transformer(\n", | |
" (numeric_pipelines[\"tree\"], get_numeric_cols(\"tree\")),\n", | |
" n_jobs=N_JOBS,\n", | |
" verbose=VERBOSE,\n", | |
" ),\n", | |
" \"lin_cat\": compose.make_column_transformer(\n", | |
" (categorical_pipelines[\"linear\"], get_categorical_cols()),\n", | |
" n_jobs=N_JOBS,\n", | |
" verbose=VERBOSE,\n", | |
" ),\n", | |
" \"tree_cat\": compose.make_column_transformer(\n", | |
" (categorical_pipelines[\"tree\"], get_categorical_cols()),\n", | |
" n_jobs=N_JOBS,\n", | |
" verbose=VERBOSE,\n", | |
" ),\n", | |
" \"lin_all\": compose.make_column_transformer(\n", | |
" (numeric_pipelines[\"linear\"], get_numeric_cols(\"linear\")),\n", | |
" (categorical_pipelines[\"linear\"], get_categorical_cols()),\n", | |
" n_jobs=N_JOBS,\n", | |
" verbose=VERBOSE,\n", | |
" ),\n", | |
" \"tree_all\": compose.make_column_transformer(\n", | |
" (numeric_pipelines[\"tree\"], get_numeric_cols(\"tree\")),\n", | |
" (categorical_pipelines[\"tree\"], get_categorical_cols()),\n", | |
" n_jobs=N_JOBS,\n", | |
" verbose=VERBOSE,\n", | |
" ),\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ae13aea5", | |
"metadata": { | |
"papermill": { | |
"duration": 0.007878, | |
"end_time": "2022-10-11T08:53:41.742002", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.734124", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "ae13aea5" | |
}, | |
"source": [ | |
"# Define the models" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# @title Install CatBoost\n", | |
"import sys\n", | |
"\n", | |
"try:\n", | |
" import catboost\n", | |
"except ImportError:\n", | |
" !{sys.executable} -m pip install -q catboost" | |
], | |
"metadata": { | |
"cellView": "form", | |
"id": "MVoTDjtPaz5r" | |
}, | |
"id": "MVoTDjtPaz5r", | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "5f263c31", | |
"metadata": { | |
"_kg_hide-input": true, | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:41.759556Z", | |
"iopub.status.busy": "2022-10-11T08:53:41.759121Z", | |
"iopub.status.idle": "2022-10-11T08:53:43.179938Z", | |
"shell.execute_reply": "2022-10-11T08:53:43.178586Z" | |
}, | |
"papermill": { | |
"duration": 1.43263, | |
"end_time": "2022-10-11T08:53:43.182544", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:41.749914", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "5f263c31" | |
}, | |
"outputs": [], | |
"source": [ | |
"import catboost\n", | |
"import lightgbm\n", | |
"import xgboost\n", | |
"from sklearn import dummy, ensemble, linear_model, tree" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "526ffcd1", | |
"metadata": { | |
"_kg_hide-input": false, | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:43.201140Z", | |
"iopub.status.busy": "2022-10-11T08:53:43.200730Z", | |
"iopub.status.idle": "2022-10-11T08:53:43.213154Z", | |
"shell.execute_reply": "2022-10-11T08:53:43.211924Z" | |
}, | |
"papermill": { | |
"duration": 0.024651, | |
"end_time": "2022-10-11T08:53:43.215751", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:43.191100", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "526ffcd1" | |
}, | |
"outputs": [], | |
"source": [ | |
"models = {\n", | |
" \"dc\": dummy.DummyClassifier(),\n", | |
" \"lr\": linear_model.LogisticRegression(\n", | |
" class_weight=\"balanced\", random_state=RANDOM_SEED\n", | |
" ),\n", | |
" \"dt\": tree.DecisionTreeClassifier(random_state=RANDOM_SEED),\n", | |
" \"rf\": ensemble.RandomForestClassifier(\n", | |
" n_jobs=N_JOBS, random_state=RANDOM_SEED, verbose=VERBOSITY\n", | |
" ),\n", | |
" \"xgb\": xgboost.XGBClassifier(\n", | |
" n_jobs=N_JOBS, random_state=RANDOM_SEED, verbosity=VERBOSITY\n", | |
" ),\n", | |
" \"cb\": catboost.CatBoostClassifier(\n", | |
" auto_class_weights=\"Balanced\", random_state=RANDOM_SEED, verbose=VERBOSITY\n", | |
" ),\n", | |
" \"lgb\": lightgbm.LGBMClassifier(\n", | |
" class_weight=\"balanced\",\n", | |
" n_jobs=N_JOBS,\n", | |
" random_state=RANDOM_SEED,\n", | |
" verbose=VERBOSITY,\n", | |
" ),\n", | |
" \"hgb\": ensemble.HistGradientBoostingClassifier(\n", | |
" random_state=RANDOM_SEED, verbose=VERBOSITY\n", | |
" ),\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "356d4d01", | |
"metadata": { | |
"papermill": { | |
"duration": 0.007997, | |
"end_time": "2022-10-11T08:53:43.232163", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:43.224166", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "356d4d01" | |
}, | |
"source": [ | |
"# Train the models" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "88d6f611", | |
"metadata": { | |
"_kg_hide-input": true, | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:43.250540Z", | |
"iopub.status.busy": "2022-10-11T08:53:43.250118Z", | |
"iopub.status.idle": "2022-10-11T08:53:43.256723Z", | |
"shell.execute_reply": "2022-10-11T08:53:43.255379Z" | |
}, | |
"papermill": { | |
"duration": 0.019216, | |
"end_time": "2022-10-11T08:53:43.259573", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:43.240357", | |
"status": "completed" | |
}, | |
"tags": [], | |
"cellView": "form", | |
"id": "88d6f611" | |
}, | |
"outputs": [], | |
"source": [ | |
"# @title Imports\n", | |
"import shutil\n", | |
"import uuid\n", | |
"\n", | |
"from sklearn.model_selection import cross_validate\n", | |
"from sklearn import set_config\n", | |
"import joblib\n", | |
"\n", | |
"set_config(display=\"diagram\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "46047e8e", | |
"metadata": { | |
"_kg_hide-input": true, | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:43.278386Z", | |
"iopub.status.busy": "2022-10-11T08:53:43.277951Z", | |
"iopub.status.idle": "2022-10-11T08:53:43.293053Z", | |
"shell.execute_reply": "2022-10-11T08:53:43.291675Z" | |
}, | |
"papermill": { | |
"duration": 0.027947, | |
"end_time": "2022-10-11T08:53:43.295970", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:43.268023", | |
"status": "completed" | |
}, | |
"tags": [], | |
"cellView": "form", | |
"id": "46047e8e" | |
}, | |
"outputs": [], | |
"source": [ | |
"# @title Helper functions\n", | |
"def summarize_metrics(metrics: dict) -> pd.DataFrame:\n", | |
" \"\"\"Summarize metrics from cross-validation.\"\"\"\n", | |
" metrics_df = pd.DataFrame(metrics)\n", | |
"\n", | |
" # convert negated metrics to positive\n", | |
" negative_metrics = metrics_df.columns.str.contains(\"neg_\")\n", | |
" metrics_df.loc[:, negative_metrics] = -metrics_df.loc[:, negative_metrics]\n", | |
"\n", | |
" # rename columns\n", | |
" metrics_df.columns = metrics_df.columns.str.replace(\"neg_\", \"\")\n", | |
"\n", | |
" # obtain mean and std\n", | |
" summary = metrics_df.describe().round(5)\n", | |
" return summary.loc[[\"mean\", \"std\"]]\n", | |
"\n", | |
"\n", | |
"def log_metrics(metrics: dict, run_id: str) -> None:\n", | |
" \"\"\"Log cross-validation metrics.\"\"\"\n", | |
" # summarize metrics\n", | |
" summary = summarize_metrics(metrics)\n", | |
" \n", | |
" # display metrics\n", | |
" print(f\"Cross validation results for run {run_id!r}:\") \n", | |
" display(summary)\n", | |
"\n", | |
"\n", | |
"def save_models(models: list, run_id: str) -> None:\n", | |
" \"\"\"Save models.\"\"\" \n", | |
" # directory for saving models\n", | |
" models_dir = OUTPUT_DIR / run_id\n", | |
" \n", | |
" # delete the directory if it exists\n", | |
" if models_dir.exists():\n", | |
" shutil.rmtree(models_dir)\n", | |
" \n", | |
" # create the directory for saving models\n", | |
" models_dir.mkdir(parents=True)\n", | |
" \n", | |
" # save models for this run\n", | |
" for fold, model in enumerate(models):\n", | |
" joblib.dump(model, filename=models_dir / f\"model_{fold}\")\n", | |
"\n", | |
"def train(model: str, preprocessor: str) -> str:\n", | |
" \"\"\"Train model.\"\"\"\n", | |
" # load data\n", | |
" train_df = pd.read_csv(TRAIN_DATA, index_col=INDEX_COL)\n", | |
"\n", | |
" # drop duplicates\n", | |
" train_df.drop_duplicates(inplace=True)\n", | |
"\n", | |
" # separate features from target\n", | |
" X = train_df.drop(TARGET_COL, axis=1)\n", | |
" y = train_df[TARGET_COL]\n", | |
"\n", | |
" # create pipeline\n", | |
" pipe = pipeline.Pipeline(\n", | |
" [\n", | |
" (preprocessor, preprocessors[preprocessor]),\n", | |
" (model, models[model]),\n", | |
" ],\n", | |
" verbose=VERBOSE,\n", | |
" )\n", | |
" \n", | |
" display(pipe)\n", | |
"\n", | |
" # cross validation\n", | |
" cv_results = cross_validate(\n", | |
" pipe,\n", | |
" X,\n", | |
" y,\n", | |
" scoring=EVAL_METRICS,\n", | |
" cv=CV_SPLITTER,\n", | |
" n_jobs=N_JOBS,\n", | |
" verbose=VERBOSITY,\n", | |
" return_estimator=True,\n", | |
" return_train_score=True,\n", | |
" )\n", | |
" estimators = cv_results.pop(\"estimator\")\n", | |
" \n", | |
" # generate a run id\n", | |
" run_id = str(uuid.uuid4()).replace(\"-\",\"\")\n", | |
" \n", | |
" # log metrics\n", | |
" log_metrics(cv_results, run_id)\n", | |
" \n", | |
" # save the models\n", | |
" save_models(estimators, run_id)\n", | |
" \n", | |
" # return the run id\n", | |
" return run_id" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "bfb8f349", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:43.314733Z", | |
"iopub.status.busy": "2022-10-11T08:53:43.314278Z", | |
"iopub.status.idle": "2022-10-11T08:53:43.320230Z", | |
"shell.execute_reply": "2022-10-11T08:53:43.319018Z" | |
}, | |
"papermill": { | |
"duration": 0.018776, | |
"end_time": "2022-10-11T08:53:43.323225", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:43.304449", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "bfb8f349", | |
"outputId": "5246acef-903f-4329-c1bd-637f3788736d", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Models available: ['dc', 'lr', 'dt', 'rf', 'xgb', 'cb', 'lgb', 'hgb']\n", | |
"Preprocessors available: ['lin_num', 'tree_num', 'lin_cat', 'tree_cat', 'lin_all', 'tree_all']\n" | |
] | |
} | |
], | |
"source": [ | |
"# list all models and pipelines\n", | |
"print(f\"Models available: {list(models.keys())}\")\n", | |
"print(f\"Preprocessors available: {list(preprocessors.keys())}\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "45cd7126", | |
"metadata": { | |
"papermill": { | |
"duration": 0.007909, | |
"end_time": "2022-10-11T08:53:43.340155", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:43.332246", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "45cd7126" | |
}, | |
"source": [ | |
"## Dummy classifier\n", | |
"This model ignores the input features. We use it as the baseline model. Read more about it on the [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "180ec199", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:43.358515Z", | |
"iopub.status.busy": "2022-10-11T08:53:43.358111Z", | |
"iopub.status.idle": "2022-10-11T08:53:45.884150Z", | |
"shell.execute_reply": "2022-10-11T08:53:45.882714Z" | |
}, | |
"papermill": { | |
"duration": 2.539245, | |
"end_time": "2022-10-11T08:53:45.887728", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:43.348483", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "180ec199", | |
"outputId": "326dc560-5ef1-49c5-a310-e8092daf91be", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 336 | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"Pipeline(steps=[('lin_num',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline',\n", | |
" Pipeline(steps=[('standard_scaler',\n", | |
" StandardScaler()),\n", | |
" ('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_emp_length',\n", | |
" 'loan_int_rate',\n", | |
" 'loan_percent_income',\n", | |
" 'cb_person_cred_hist_length'))])),\n", | |
" ('dc', DummyClassifier())])" | |
], | |
"text/html": [ | |
"<style>#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 {color: black;background-color: white;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 pre{padding: 0;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-toggleable {background-color: white;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-estimator {font-family: monospace;background-color: 
#f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-estimator:hover {background-color: #d4ebff;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-item {z-index: 1;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-parallel::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-parallel-item {display: flex;flex-direction: column;position: relative;background-color: white;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-parallel-item:only-child::after {width: 0;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;position: relative;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-label label {font-family: monospace;font-weight: bold;background-color: white;display: 
inline-block;line-height: 1.2em;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-label-container {position: relative;z-index: 2;text-align: center;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-2790ea9e-68c8-43d0-92ed-409f8aad24b6\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[('lin_num',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline',\n", | |
" Pipeline(steps=[('standard_scaler',\n", | |
" StandardScaler()),\n", | |
" ('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_emp_length',\n", | |
" 'loan_int_rate',\n", | |
" 'loan_percent_income',\n", | |
" 'cb_person_cred_hist_length'))])),\n", | |
" ('dc', DummyClassifier())])</pre><b>Please rerun this cell to show the HTML repr or trust the notebook.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"8f452748-e784-4a44-b266-0accfa225c7e\" type=\"checkbox\" ><label for=\"8f452748-e784-4a44-b266-0accfa225c7e\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[('lin_num',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline',\n", | |
" Pipeline(steps=[('standard_scaler',\n", | |
" StandardScaler()),\n", | |
" ('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_emp_length',\n", | |
" 'loan_int_rate',\n", | |
" 'loan_percent_income',\n", | |
" 'cb_person_cred_hist_length'))])),\n", | |
" ('dc', DummyClassifier())])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"e9f77c77-0c14-46c3-87ee-e657e2a7850e\" type=\"checkbox\" ><label for=\"e9f77c77-0c14-46c3-87ee-e657e2a7850e\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">lin_num: ColumnTransformer</label><div class=\"sk-toggleable__content\"><pre>ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline',\n", | |
" Pipeline(steps=[('standard_scaler',\n", | |
" StandardScaler()),\n", | |
" ('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_emp_length', 'loan_int_rate',\n", | |
" 'loan_percent_income',\n", | |
" 'cb_person_cred_hist_length'))])</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"19c8efc3-347a-4aca-892b-50946dad6ffe\" type=\"checkbox\" ><label for=\"19c8efc3-347a-4aca-892b-50946dad6ffe\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">pipeline</label><div class=\"sk-toggleable__content\"><pre>('person_emp_length', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length')</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"36c98346-c95a-44ea-ba66-79b27fffc1b4\" type=\"checkbox\" ><label for=\"36c98346-c95a-44ea-ba66-79b27fffc1b4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">StandardScaler</label><div class=\"sk-toggleable__content\"><pre>StandardScaler()</pre></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"90733f5a-780f-4ff9-89dd-8c1af19c8bc9\" type=\"checkbox\" ><label for=\"90733f5a-780f-4ff9-89dd-8c1af19c8bc9\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">SimpleImputer</label><div class=\"sk-toggleable__content\"><pre>SimpleImputer(strategy='median')</pre></div></div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"4585ae05-1564-4fac-9bd2-ec40ba67c6e9\" type=\"checkbox\" ><label for=\"4585ae05-1564-4fac-9bd2-ec40ba67c6e9\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">DummyClassifier</label><div class=\"sk-toggleable__content\"><pre>DummyClassifier()</pre></div></div></div></div></div></div></div>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Cross validation results for run 'b1e0789317fe4247aaa7d5f9846b4d8e':\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" fit_time score_time test_roc_auc train_roc_auc\n", | |
"mean 0.11345 0.10650 0.5 0.5\n", | |
"std 0.00482 0.00167 0.0 0.0" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-31f09d4d-2348-4a02-b99b-d1edcff82799\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>fit_time</th>\n", | |
" <th>score_time</th>\n", | |
" <th>test_roc_auc</th>\n", | |
" <th>train_roc_auc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>0.11345</td>\n", | |
" <td>0.10650</td>\n", | |
" <td>0.5</td>\n", | |
" <td>0.5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>0.00482</td>\n", | |
" <td>0.00167</td>\n", | |
" <td>0.0</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-31f09d4d-2348-4a02-b99b-d1edcff82799')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-31f09d4d-2348-4a02-b99b-d1edcff82799 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-31f09d4d-2348-4a02-b99b-d1edcff82799');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"source": [ | |
"dc_run_id = train(\"dc\", \"lin_num\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "be49b6b7", | |
"metadata": { | |
"papermill": { | |
"duration": 0.009202, | |
"end_time": "2022-10-11T08:53:45.910688", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:45.901486", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "be49b6b7" | |
}, | |
"source": [ | |
"## Logistic Regression\n", | |
"Logistic regression is a linear model for classification. Read more about it in the [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "f9d7ed3c", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:45.930506Z", | |
"iopub.status.busy": "2022-10-11T08:53:45.930015Z", | |
"iopub.status.idle": "2022-10-11T08:53:47.334800Z", | |
"shell.execute_reply": "2022-10-11T08:53:47.333488Z" | |
}, | |
"papermill": { | |
"duration": 1.418111, | |
"end_time": "2022-10-11T08:53:47.337679", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:45.919568", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "f9d7ed3c", | |
"outputId": "ef5ec49f-711b-4c50-8f0b-e5acba36896c", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 336 | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"Pipeline(steps=[('lin_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('standard_scaler',\n", | |
" StandardScaler()),\n", | |
" ('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_emp_length',\n", | |
" 'loan_int_rate',\n", | |
" 'loan_percent_income',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('one_hot_encoder',\n", | |
" OneHotEncoder(handle_unknown='ignore'))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c04710>)])),\n", | |
" ('lr',\n", | |
" LogisticRegression(class_weight='balanced',\n", | |
" random_state=100))])" | |
], | |
"text/html": [ | |
"<style>#sk-c2415f49-b976-43b9-90ca-d55641e74e65 {color: black;background-color: white;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 pre{padding: 0;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-toggleable {background-color: white;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-estimator {font-family: monospace;background-color: 
#f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-estimator:hover {background-color: #d4ebff;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-item {z-index: 1;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-parallel::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-parallel-item {display: flex;flex-direction: column;position: relative;background-color: white;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-parallel-item:only-child::after {width: 0;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;position: relative;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-label label {font-family: monospace;font-weight: bold;background-color: white;display: 
inline-block;line-height: 1.2em;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-label-container {position: relative;z-index: 2;text-align: center;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-c2415f49-b976-43b9-90ca-d55641e74e65 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-c2415f49-b976-43b9-90ca-d55641e74e65\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[('lin_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('standard_scaler',\n", | |
" StandardScaler()),\n", | |
" ('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_emp_length',\n", | |
" 'loan_int_rate',\n", | |
" 'loan_percent_income',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('one_hot_encoder',\n", | |
" OneHotEncoder(handle_unknown='ignore'))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c04710>)])),\n", | |
" ('lr',\n", | |
" LogisticRegression(class_weight='balanced',\n", | |
" random_state=100))])</pre><b>Please rerun this cell to show the HTML repr or trust the notebook.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"a8a90c38-81dc-44dd-9d86-34d1337d27ba\" type=\"checkbox\" ><label for=\"a8a90c38-81dc-44dd-9d86-34d1337d27ba\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[('lin_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('standard_scaler',\n", | |
" StandardScaler()),\n", | |
" ('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_emp_length',\n", | |
" 'loan_int_rate',\n", | |
" 'loan_percent_income',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('one_hot_encoder',\n", | |
" OneHotEncoder(handle_unknown='ignore'))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c04710>)])),\n", | |
" ('lr',\n", | |
" LogisticRegression(class_weight='balanced',\n", | |
" random_state=100))])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"f481e145-71f5-4755-aff3-f835d7ca1eac\" type=\"checkbox\" ><label for=\"f481e145-71f5-4755-aff3-f835d7ca1eac\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">lin_all: ColumnTransformer</label><div class=\"sk-toggleable__content\"><pre>ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('standard_scaler',\n", | |
" StandardScaler()),\n", | |
" ('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_emp_length', 'loan_int_rate',\n", | |
" 'loan_percent_income',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('one_hot_encoder',\n", | |
" OneHotEncoder(handle_unknown='ignore'))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c04710>)])</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"fb73bd79-9d35-4eb1-997d-67486c902e0c\" type=\"checkbox\" ><label for=\"fb73bd79-9d35-4eb1-997d-67486c902e0c\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">pipeline-1</label><div class=\"sk-toggleable__content\"><pre>('person_emp_length', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length')</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"c9a6c290-823c-4f54-9d7e-aa7521ff968e\" type=\"checkbox\" ><label for=\"c9a6c290-823c-4f54-9d7e-aa7521ff968e\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">StandardScaler</label><div class=\"sk-toggleable__content\"><pre>StandardScaler()</pre></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"21bbc557-89cb-4a39-98fc-273dd1ccb1e7\" type=\"checkbox\" ><label for=\"21bbc557-89cb-4a39-98fc-273dd1ccb1e7\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">SimpleImputer</label><div class=\"sk-toggleable__content\"><pre>SimpleImputer(strategy='median')</pre></div></div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"af704f6c-7cf8-47a6-bce9-5015af274d1b\" type=\"checkbox\" ><label for=\"af704f6c-7cf8-47a6-bce9-5015af274d1b\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">pipeline-2</label><div 
class=\"sk-toggleable__content\"><pre><sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c04710></pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"62ebd68d-14db-47e5-8160-3f6d677fb1cf\" type=\"checkbox\" ><label for=\"62ebd68d-14db-47e5-8160-3f6d677fb1cf\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">OneHotEncoder</label><div class=\"sk-toggleable__content\"><pre>OneHotEncoder(handle_unknown='ignore')</pre></div></div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"423c7e3d-8c6d-4afd-a5bf-6cac442d839e\" type=\"checkbox\" ><label for=\"423c7e3d-8c6d-4afd-a5bf-6cac442d839e\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(class_weight='balanced', random_state=100)</pre></div></div></div></div></div></div></div>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Cross validation results for run 'f5e6ab68bb6a402fa26a96ec9c08d56e':\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" fit_time score_time test_roc_auc train_roc_auc\n", | |
"mean 0.60594 0.12213 0.85527 0.85653\n", | |
"std 0.09781 0.01602 0.01413 0.00339" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-58ea1ba2-9aa9-4db2-85b2-33e5e4d17df5\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>fit_time</th>\n", | |
" <th>score_time</th>\n", | |
" <th>test_roc_auc</th>\n", | |
" <th>train_roc_auc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>0.60594</td>\n", | |
" <td>0.12213</td>\n", | |
" <td>0.85527</td>\n", | |
" <td>0.85653</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>0.09781</td>\n", | |
" <td>0.01602</td>\n", | |
" <td>0.01413</td>\n", | |
" <td>0.00339</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-58ea1ba2-9aa9-4db2-85b2-33e5e4d17df5')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-58ea1ba2-9aa9-4db2-85b2-33e5e4d17df5 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-58ea1ba2-9aa9-4db2-85b2-33e5e4d17df5');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"source": [ | |
"lr_run_id = train(\"lr\", \"lin_all\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "57e014de", | |
"metadata": { | |
"papermill": { | |
"duration": 0.009672, | |
"end_time": "2022-10-11T08:53:47.357236", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:47.347564", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "57e014de" | |
}, | |
"source": [ | |
"## Decision Tree\n", | |
"As its name suggests, this is a tree-based model. Read more about it in the [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html).\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "60a1a916", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:47.378864Z", | |
"iopub.status.busy": "2022-10-11T08:53:47.378417Z", | |
"iopub.status.idle": "2022-10-11T08:53:48.424302Z", | |
"shell.execute_reply": "2022-10-11T08:53:48.423250Z" | |
}, | |
"papermill": { | |
"duration": 1.059881, | |
"end_time": "2022-10-11T08:53:48.427152", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:47.367271", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "60a1a916", | |
"outputId": "8a0fce2b-f675-41c1-ef48-33ca9ee64eeb", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 302 | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"Pipeline(steps=[('tree_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income',\n", | |
" 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])),\n", | |
" ('dt', DecisionTreeClassifier(random_state=100))])" | |
], | |
"text/html": [ | |
"<style>#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 {color: black;background-color: white;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 pre{padding: 0;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-toggleable {background-color: white;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-estimator {font-family: monospace;background-color: 
#f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-estimator:hover {background-color: #d4ebff;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-item {z-index: 1;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-parallel::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-parallel-item {display: flex;flex-direction: column;position: relative;background-color: white;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-parallel-item:only-child::after {width: 0;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;position: relative;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-label label {font-family: monospace;font-weight: bold;background-color: white;display: 
inline-block;line-height: 1.2em;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-label-container {position: relative;z-index: 2;text-align: center;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-846f548e-9e6a-4d93-91fd-bf72733643d4 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-846f548e-9e6a-4d93-91fd-bf72733643d4\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[('tree_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income',\n", | |
" 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])),\n", | |
" ('dt', DecisionTreeClassifier(random_state=100))])</pre><b>Please rerun this cell to show the HTML repr or trust the notebook.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"4b388e14-d1ee-4f8e-9b67-9a2d172d6ad8\" type=\"checkbox\" ><label for=\"4b388e14-d1ee-4f8e-9b67-9a2d172d6ad8\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[('tree_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income',\n", | |
" 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])),\n", | |
" ('dt', DecisionTreeClassifier(random_state=100))])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"0f706f84-9e73-4dc1-8121-6bdcaabf21d8\" type=\"checkbox\" ><label for=\"0f706f84-9e73-4dc1-8121-6bdcaabf21d8\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">tree_all: ColumnTransformer</label><div class=\"sk-toggleable__content\"><pre>ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income', 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"5d4031d5-7cd6-40f0-b7ca-1bb49eb2ad56\" type=\"checkbox\" ><label for=\"5d4031d5-7cd6-40f0-b7ca-1bb49eb2ad56\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">pipeline-1</label><div class=\"sk-toggleable__content\"><pre>('person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'cb_person_cred_hist_length')</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"63172cf2-74d5-4049-a66a-7c6e03a2594a\" type=\"checkbox\" ><label for=\"63172cf2-74d5-4049-a66a-7c6e03a2594a\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">SimpleImputer</label><div class=\"sk-toggleable__content\"><pre>SimpleImputer(strategy='median')</pre></div></div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"b1ec853c-a6c0-44b9-852f-75e05cb1c01e\" type=\"checkbox\" ><label for=\"b1ec853c-a6c0-44b9-852f-75e05cb1c01e\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">pipeline-2</label><div class=\"sk-toggleable__content\"><pre><sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0></pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"2c84e8b1-cd9f-475e-91be-1bb732204b84\" type=\"checkbox\" ><label 
for=\"2c84e8b1-cd9f-475e-91be-1bb732204b84\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">OrdinalEncoder</label><div class=\"sk-toggleable__content\"><pre>OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999)</pre></div></div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"2a8179a7-e5ae-485f-9a04-e2dbd54dc20e\" type=\"checkbox\" ><label for=\"2a8179a7-e5ae-485f-9a04-e2dbd54dc20e\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">DecisionTreeClassifier</label><div class=\"sk-toggleable__content\"><pre>DecisionTreeClassifier(random_state=100)</pre></div></div></div></div></div></div></div>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Cross validation results for run '467f34a84abf400eb0203166fc4ef144':\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" fit_time score_time test_roc_auc train_roc_auc\n", | |
"mean 0.37680 0.13636 0.82854 1.0\n", | |
"std 0.04658 0.01200 0.00785 0.0" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-fa0d959f-8bf9-4ca4-a425-7d62abd15be4\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>fit_time</th>\n", | |
" <th>score_time</th>\n", | |
" <th>test_roc_auc</th>\n", | |
" <th>train_roc_auc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>0.37680</td>\n", | |
" <td>0.13636</td>\n", | |
" <td>0.82854</td>\n", | |
" <td>1.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>0.04658</td>\n", | |
" <td>0.01200</td>\n", | |
" <td>0.00785</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-fa0d959f-8bf9-4ca4-a425-7d62abd15be4')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-fa0d959f-8bf9-4ca4-a425-7d62abd15be4 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-fa0d959f-8bf9-4ca4-a425-7d62abd15be4');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"source": [ | |
"dt_run_id = train(\"dt\", \"tree_all\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d036a723", | |
"metadata": { | |
"papermill": { | |
"duration": 0.00984, | |
"end_time": "2022-10-11T08:53:48.447371", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:48.437531", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "d036a723" | |
}, | |
"source": [ | |
"## Ensemble methods\n", | |
"There are several types of ensemble methods. They include:\n", | |
"- Bagging (short for *bootstrap aggregation*)\n", | |
"- Boosting\n", | |
"- Pasting\n", | |
"- Random patches\n", | |
"- Random subspaces\n", | |
"- Stacking\n", | |
"\n", | |
"We shall cover the first two in this lesson." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d31a6019", | |
"metadata": { | |
"papermill": { | |
"duration": 0.009815, | |
"end_time": "2022-10-11T08:53:48.467381", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:48.457566", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "d31a6019" | |
}, | |
"source": [ | |
"## Random Forest Classifier\n", | |
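"This is an ensemble of decision trees that works by bagging: each tree is trained on a bootstrap sample of the training data, and the trees' predictions are combined. Read more about it in the [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)." | |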
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "c61d7dbc", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:48.490479Z", | |
"iopub.status.busy": "2022-10-11T08:53:48.489269Z", | |
"iopub.status.idle": "2022-10-11T08:53:55.744429Z", | |
"shell.execute_reply": "2022-10-11T08:53:55.742921Z" | |
}, | |
"papermill": { | |
"duration": 7.270292, | |
"end_time": "2022-10-11T08:53:55.747694", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:48.477402", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "c61d7dbc", | |
"outputId": "1cd304b4-7689-48e3-8705-3410cd422ccb", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 302 | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"Pipeline(steps=[('tree_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income',\n", | |
" 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])),\n", | |
" ('rf', RandomForestClassifier(n_jobs=-1, random_state=100))])" | |
], | |
"text/html": [ | |
"<style>#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d {color: black;background-color: white;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d pre{padding: 0;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-toggleable {background-color: white;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-estimator {font-family: monospace;background-color: 
#f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-estimator:hover {background-color: #d4ebff;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-item {z-index: 1;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-parallel::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-parallel-item {display: flex;flex-direction: column;position: relative;background-color: white;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-parallel-item:only-child::after {width: 0;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;position: relative;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-label label {font-family: monospace;font-weight: bold;background-color: white;display: 
inline-block;line-height: 1.2em;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-label-container {position: relative;z-index: 2;text-align: center;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-d7fde94f-3592-49ba-96c1-c38d54ae5f0d\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[('tree_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income',\n", | |
" 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])),\n", | |
" ('rf', RandomForestClassifier(n_jobs=-1, random_state=100))])</pre><b>Please rerun this cell to show the HTML repr or trust the notebook.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"55d3eb18-6a6c-4711-bb8b-8543702a954e\" type=\"checkbox\" ><label for=\"55d3eb18-6a6c-4711-bb8b-8543702a954e\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[('tree_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income',\n", | |
" 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])),\n", | |
" ('rf', RandomForestClassifier(n_jobs=-1, random_state=100))])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"f8b48d66-6bf8-4eac-bf50-29e9d0951b9f\" type=\"checkbox\" ><label for=\"f8b48d66-6bf8-4eac-bf50-29e9d0951b9f\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">tree_all: ColumnTransformer</label><div class=\"sk-toggleable__content\"><pre>ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income', 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"03d439c6-9b72-4f1d-9780-628ec37740e5\" type=\"checkbox\" ><label for=\"03d439c6-9b72-4f1d-9780-628ec37740e5\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">pipeline-1</label><div class=\"sk-toggleable__content\"><pre>('person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'cb_person_cred_hist_length')</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"2a04fd09-9599-4698-a602-9f79d268805b\" type=\"checkbox\" ><label for=\"2a04fd09-9599-4698-a602-9f79d268805b\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">SimpleImputer</label><div class=\"sk-toggleable__content\"><pre>SimpleImputer(strategy='median')</pre></div></div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"26bcf123-623b-42d7-be4a-310173595f30\" type=\"checkbox\" ><label for=\"26bcf123-623b-42d7-be4a-310173595f30\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">pipeline-2</label><div class=\"sk-toggleable__content\"><pre><sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0></pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"e8197e7e-b90a-4ce9-875c-769ed2da20fe\" type=\"checkbox\" ><label 
for=\"e8197e7e-b90a-4ce9-875c-769ed2da20fe\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">OrdinalEncoder</label><div class=\"sk-toggleable__content\"><pre>OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999)</pre></div></div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"10fd1f5e-dd4f-40e1-855a-633975e5e97c\" type=\"checkbox\" ><label for=\"10fd1f5e-dd4f-40e1-855a-633975e5e97c\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(n_jobs=-1, random_state=100)</pre></div></div></div></div></div></div></div>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Cross validation results for run '2a3f064842ca45d7bf8544f46031d12d':\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" fit_time score_time test_roc_auc train_roc_auc\n", | |
"mean 6.10923 0.39508 0.92497 1.0\n", | |
"std 1.13856 0.12407 0.00634 0.0" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-72d7d98a-ca04-4e22-bb6f-7b9036bf8a42\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>fit_time</th>\n", | |
" <th>score_time</th>\n", | |
" <th>test_roc_auc</th>\n", | |
" <th>train_roc_auc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>6.10923</td>\n", | |
" <td>0.39508</td>\n", | |
" <td>0.92497</td>\n", | |
" <td>1.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>1.13856</td>\n", | |
" <td>0.12407</td>\n", | |
" <td>0.00634</td>\n", | |
" <td>0.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-72d7d98a-ca04-4e22-bb6f-7b9036bf8a42')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-72d7d98a-ca04-4e22-bb6f-7b9036bf8a42 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-72d7d98a-ca04-4e22-bb6f-7b9036bf8a42');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"source": [ | |
"rf_run_id = train(\"rf\", \"tree_all\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "753debf4", | |
"metadata": { | |
"papermill": { | |
"duration": 0.010578, | |
"end_time": "2022-10-11T08:53:55.769146", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:55.758568", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "753debf4" | |
}, | |
"source": [ | |
"## Histogram-based Gradient Boosting Classifier\n", | |
"This is an ensemble model that works by boosting: trees are added one after another, each one fitted to correct the errors of the ensemble built so far. Read more about it in the [scikit-learn documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html).\n", | |
"\n", | |
"Other examples of boosted ensemble models include: [XGBoost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier), [CatBoost](https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier) and [LightGBM](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html). They are not part of scikit-learn, but implement the scikit-learn interface. You can experiment with them." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "d823b321", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:55.792882Z", | |
"iopub.status.busy": "2022-10-11T08:53:55.792437Z", | |
"iopub.status.idle": "2022-10-11T08:53:59.308895Z", | |
"shell.execute_reply": "2022-10-11T08:53:59.307782Z" | |
}, | |
"papermill": { | |
"duration": 3.531598, | |
"end_time": "2022-10-11T08:53:59.311573", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:55.779975", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "d823b321", | |
"outputId": "25e9b2ab-9b0b-47ff-ed29-2b66febaebff", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 302 | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"Pipeline(steps=[('tree_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income',\n", | |
" 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])),\n", | |
" ('hgb', HistGradientBoostingClassifier(random_state=100))])" | |
], | |
"text/html": [ | |
"<style>#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc {color: black;background-color: white;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc pre{padding: 0;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-toggleable {background-color: white;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-estimator {font-family: monospace;background-color: 
#f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-estimator:hover {background-color: #d4ebff;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-item {z-index: 1;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-parallel::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 2em;bottom: 0;left: 50%;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-parallel-item {display: flex;flex-direction: column;position: relative;background-color: white;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-parallel-item:only-child::after {width: 0;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;position: relative;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-label label {font-family: monospace;font-weight: bold;background-color: white;display: 
inline-block;line-height: 1.2em;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-label-container {position: relative;z-index: 2;text-align: center;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-eb01de65-fc9f-4c90-8ea1-8acf320e30dc\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[('tree_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income',\n", | |
" 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])),\n", | |
" ('hgb', HistGradientBoostingClassifier(random_state=100))])</pre><b>Please rerun this cell to show the HTML repr or trust the notebook.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"d6bb89cf-8fb0-46d7-a0c5-ad0644d9aa16\" type=\"checkbox\" ><label for=\"d6bb89cf-8fb0-46d7-a0c5-ad0644d9aa16\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">Pipeline</label><div class=\"sk-toggleable__content\"><pre>Pipeline(steps=[('tree_all',\n", | |
" ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income',\n", | |
" 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])),\n", | |
" ('hgb', HistGradientBoostingClassifier(random_state=100))])</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"35a34cf6-d2cb-4a03-98a7-2a645ca50409\" type=\"checkbox\" ><label for=\"35a34cf6-d2cb-4a03-98a7-2a645ca50409\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">tree_all: ColumnTransformer</label><div class=\"sk-toggleable__content\"><pre>ColumnTransformer(n_jobs=-1,\n", | |
" transformers=[('pipeline-1',\n", | |
" Pipeline(steps=[('median_imputer',\n", | |
" SimpleImputer(strategy='median'))],\n", | |
" verbose=0),\n", | |
" ('person_income', 'person_emp_length',\n", | |
" 'loan_amnt', 'loan_int_rate',\n", | |
" 'cb_person_cred_hist_length')),\n", | |
" ('pipeline-2',\n", | |
" Pipeline(steps=[('ordinal_encoder',\n", | |
" OrdinalEncoder(handle_unknown='use_encoded_value',\n", | |
" unknown_value=-999))],\n", | |
" verbose=0),\n", | |
" <sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0>)])</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"a6ef9c67-b785-4f56-9eee-9dc4b530bc31\" type=\"checkbox\" ><label for=\"a6ef9c67-b785-4f56-9eee-9dc4b530bc31\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">pipeline-1</label><div class=\"sk-toggleable__content\"><pre>('person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'cb_person_cred_hist_length')</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"0c9003e7-20d0-49a2-a769-d7a75896e6a1\" type=\"checkbox\" ><label for=\"0c9003e7-20d0-49a2-a769-d7a75896e6a1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">SimpleImputer</label><div class=\"sk-toggleable__content\"><pre>SimpleImputer(strategy='median')</pre></div></div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"e7c520bd-f2a5-4fed-85ba-2b92375335dc\" type=\"checkbox\" ><label for=\"e7c520bd-f2a5-4fed-85ba-2b92375335dc\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">pipeline-2</label><div class=\"sk-toggleable__content\"><pre><sklearn.compose._column_transformer.make_column_selector object at 0x7fa725c047d0></pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"f2641d8d-37dd-4a02-a097-24a1e8de8a42\" type=\"checkbox\" ><label 
for=\"f2641d8d-37dd-4a02-a097-24a1e8de8a42\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">OrdinalEncoder</label><div class=\"sk-toggleable__content\"><pre>OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-999)</pre></div></div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"84709507-0a89-416b-9fc7-5274e8d6d34f\" type=\"checkbox\" ><label for=\"84709507-0a89-416b-9fc7-5274e8d6d34f\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">HistGradientBoostingClassifier</label><div class=\"sk-toggleable__content\"><pre>HistGradientBoostingClassifier(random_state=100)</pre></div></div></div></div></div></div></div>" | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Cross validation results for run '8dd61f42592647bbb32eddf29816e9fb':\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" fit_time score_time test_roc_auc train_roc_auc\n", | |
"mean 1.06248 0.16568 0.93747 0.97089\n", | |
"std 0.18190 0.01227 0.00549 0.00077" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-5a402b95-ed1c-4a65-831d-f9521c95ca49\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>fit_time</th>\n", | |
" <th>score_time</th>\n", | |
" <th>test_roc_auc</th>\n", | |
" <th>train_roc_auc</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>1.06248</td>\n", | |
" <td>0.16568</td>\n", | |
" <td>0.93747</td>\n", | |
" <td>0.97089</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>0.18190</td>\n", | |
" <td>0.01227</td>\n", | |
" <td>0.00549</td>\n", | |
" <td>0.00077</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-5a402b95-ed1c-4a65-831d-f9521c95ca49')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-5a402b95-ed1c-4a65-831d-f9521c95ca49 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-5a402b95-ed1c-4a65-831d-f9521c95ca49');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"source": [ | |
"hgb_run_id = train(\"hgb\", \"tree_all\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ae91b8a7", | |
"metadata": { | |
"papermill": { | |
"duration": 0.011579, | |
"end_time": "2022-10-11T08:53:59.334963", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:59.323384", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "ae91b8a7" | |
}, | |
"source": [ | |
"# Predict on unseen data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "1535bba9", | |
"metadata": { | |
"_kg_hide-input": true, | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:59.360994Z", | |
"iopub.status.busy": "2022-10-11T08:53:59.359944Z", | |
"iopub.status.idle": "2022-10-11T08:53:59.373189Z", | |
"shell.execute_reply": "2022-10-11T08:53:59.371934Z" | |
}, | |
"papermill": { | |
"duration": 0.028955, | |
"end_time": "2022-10-11T08:53:59.375703", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:59.346748", | |
"status": "completed" | |
}, | |
"tags": [], | |
"cellView": "form", | |
"id": "1535bba9" | |
}, | |
"outputs": [], | |
"source": [ | |
"# @title Helper functions\n", | |
"def save_predictions(predictions: pd.DataFrame, file_name: str) -> None:\n", | |
" \"\"\"Save predictions\"\"\"\n", | |
" # create folder for saving predictions\n", | |
" predictions_path = OUTPUT_DIR / \"predictions\"\n", | |
" if not predictions_path.exists():\n", | |
" predictions_path.mkdir()\n", | |
"\n", | |
" # save predictions\n", | |
" file = predictions_path / file_name\n", | |
" predictions.to_csv(file, index=False)\n", | |
"\n", | |
"\n", | |
"def load_models(run_id: str) -> list:\n", | |
" \"\"\"Load all models from a given run ID\"\"\"\n", | |
" # get the number of folds for this run\n", | |
" models_dir = OUTPUT_DIR / run_id\n", | |
" n_folds = len(list(models_dir.glob(\"*\")))\n", | |
"\n", | |
" # load models\n", | |
" models = []\n", | |
" for fold in range(n_folds):\n", | |
" models.append(joblib.load(models_dir / f\"model_{fold}\"))\n", | |
" return models\n", | |
"\n", | |
"def predict(run_id: str, proba=False, save_preds=True) -> None:\n", | |
" # load data\n", | |
" data_path = TEST_DATA\n", | |
" test_df = pd.read_csv(data_path, index_col=INDEX_COL)\n", | |
"\n", | |
" # load models\n", | |
" estimators = load_models(run_id)\n", | |
"\n", | |
" # obtain predictions\n", | |
" predictions = None\n", | |
" for estimator in estimators:\n", | |
" test_preds = estimator.predict_proba(test_df)\n", | |
"\n", | |
" if predictions is None:\n", | |
" predictions = test_preds\n", | |
" else:\n", | |
" predictions += test_preds\n", | |
"\n", | |
" # average predictions and create a dataframe\n", | |
" predictions /= len(estimators)\n", | |
" predictions_df = pd.DataFrame(\n", | |
" predictions, columns=estimator.classes_, index=test_df.index\n", | |
" )\n", | |
"\n", | |
" # format predictions depending on whether we want probabilities or classes\n", | |
" if proba:\n", | |
" # get the probability of the positive class for binary classification\n", | |
" if predictions_df.shape[1] == 2:\n", | |
" predictions_df = predictions_df.iloc[:, 1].rename(TARGET_COL)\n", | |
" else:\n", | |
" predictions_df = predictions_df.idxmax(axis=1).rename(TARGET_COL)\n", | |
"\n", | |
" # reset the index\n", | |
" predictions_df = predictions_df.reset_index()\n", | |
"\n", | |
" # save predictions\n", | |
" if save_preds:\n", | |
" save_predictions(predictions_df, f\"{run_id}_{data_path.stem}.csv\")\n", | |
"\n", | |
" # return predictions\n", | |
" return predictions_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "a4678750", | |
"metadata": { | |
"execution": { | |
"iopub.execute_input": "2022-10-11T08:53:59.400744Z", | |
"iopub.status.busy": "2022-10-11T08:53:59.400289Z", | |
"iopub.status.idle": "2022-10-11T08:54:02.001835Z", | |
"shell.execute_reply": "2022-10-11T08:54:02.000893Z" | |
}, | |
"papermill": { | |
"duration": 2.617337, | |
"end_time": "2022-10-11T08:54:02.004362", | |
"exception": false, | |
"start_time": "2022-10-11T08:53:59.387025", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "a4678750", | |
"outputId": "670ce60f-63ce-4e26-85df-04e9ef79f0c8", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 961 | |
} | |
}, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" index loan_status\n", | |
"0 0 0.220778\n", | |
"1 1 0.220778\n", | |
"2 2 0.220778\n", | |
"3 3 0.220778\n", | |
"4 4 0.220778" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-a5f49af0-89cc-4a01-8cf4-0ff8e50420dc\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>index</th>\n", | |
" <th>loan_status</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>0.220778</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>0.220778</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>0.220778</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>0.220778</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>0.220778</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-a5f49af0-89cc-4a01-8cf4-0ff8e50420dc')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-a5f49af0-89cc-4a01-8cf4-0ff8e50420dc button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-a5f49af0-89cc-4a01-8cf4-0ff8e50420dc');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" index loan_status\n", | |
"0 0 0.953789\n", | |
"1 1 0.975890\n", | |
"2 2 0.907941\n", | |
"3 3 0.964061\n", | |
"4 4 0.985547" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-83828fc6-d179-44a6-accd-3bf88b17f87a\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>index</th>\n", | |
" <th>loan_status</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>0.953789</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>0.975890</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>0.907941</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>0.964061</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>0.985547</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-83828fc6-d179-44a6-accd-3bf88b17f87a')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-83828fc6-d179-44a6-accd-3bf88b17f87a button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-83828fc6-d179-44a6-accd-3bf88b17f87a');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" index loan_status\n", | |
"0 0 1.0\n", | |
"1 1 1.0\n", | |
"2 2 1.0\n", | |
"3 3 1.0\n", | |
"4 4 1.0" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-2d731a82-52fe-47fe-bbcb-dd3734a32573\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>index</th>\n", | |
" <th>loan_status</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>1.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>1.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>1.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>1.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>1.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-2d731a82-52fe-47fe-bbcb-dd3734a32573')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-2d731a82-52fe-47fe-bbcb-dd3734a32573 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-2d731a82-52fe-47fe-bbcb-dd3734a32573');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" index loan_status\n", | |
"0 0 0.914\n", | |
"1 1 0.858\n", | |
"2 2 0.844\n", | |
"3 3 0.676\n", | |
"4 4 0.774" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-5f955f2b-5eff-41c6-b6be-e5eadde22e2a\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>index</th>\n", | |
" <th>loan_status</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>0.914</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>0.858</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>0.844</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>0.676</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>0.774</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-5f955f2b-5eff-41c6-b6be-e5eadde22e2a')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-5f955f2b-5eff-41c6-b6be-e5eadde22e2a button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-5f955f2b-5eff-41c6-b6be-e5eadde22e2a');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
" index loan_status\n", | |
"0 0 0.993541\n", | |
"1 1 0.994858\n", | |
"2 2 0.966449\n", | |
"3 3 0.511306\n", | |
"4 4 0.937949" | |
], | |
"text/html": [ | |
"\n", | |
" <div id=\"df-d0ae6785-77b4-4a08-8b38-34ab94438d31\">\n", | |
" <div class=\"colab-df-container\">\n", | |
" <div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>index</th>\n", | |
" <th>loan_status</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>0</td>\n", | |
" <td>0.993541</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>0.994858</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>0.966449</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>0.511306</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>0.937949</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>\n", | |
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d0ae6785-77b4-4a08-8b38-34ab94438d31')\"\n", | |
" title=\"Convert this dataframe to an interactive table.\"\n", | |
" style=\"display:none;\">\n", | |
" \n", | |
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
" width=\"24px\">\n", | |
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n", | |
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n", | |
" </svg>\n", | |
" </button>\n", | |
" \n", | |
" <style>\n", | |
" .colab-df-container {\n", | |
" display:flex;\n", | |
" flex-wrap:wrap;\n", | |
" gap: 12px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert {\n", | |
" background-color: #E8F0FE;\n", | |
" border: none;\n", | |
" border-radius: 50%;\n", | |
" cursor: pointer;\n", | |
" display: none;\n", | |
" fill: #1967D2;\n", | |
" height: 32px;\n", | |
" padding: 0 0 0 0;\n", | |
" width: 32px;\n", | |
" }\n", | |
"\n", | |
" .colab-df-convert:hover {\n", | |
" background-color: #E2EBFA;\n", | |
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
" fill: #174EA6;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert {\n", | |
" background-color: #3B4455;\n", | |
" fill: #D2E3FC;\n", | |
" }\n", | |
"\n", | |
" [theme=dark] .colab-df-convert:hover {\n", | |
" background-color: #434B5C;\n", | |
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
" fill: #FFFFFF;\n", | |
" }\n", | |
" </style>\n", | |
"\n", | |
" <script>\n", | |
" const buttonEl =\n", | |
" document.querySelector('#df-d0ae6785-77b4-4a08-8b38-34ab94438d31 button.colab-df-convert');\n", | |
" buttonEl.style.display =\n", | |
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
"\n", | |
" async function convertToInteractive(key) {\n", | |
" const element = document.querySelector('#df-d0ae6785-77b4-4a08-8b38-34ab94438d31');\n", | |
" const dataTable =\n", | |
" await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
" [key], {});\n", | |
" if (!dataTable) return;\n", | |
"\n", | |
" const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
" + ' to learn more about interactive tables.';\n", | |
" element.innerHTML = '';\n", | |
" dataTable['output_type'] = 'display_data';\n", | |
" await google.colab.output.renderOutput(dataTable, element);\n", | |
" const docLink = document.createElement('div');\n", | |
" docLink.innerHTML = docLinkHtml;\n", | |
" element.appendChild(docLink);\n", | |
" }\n", | |
" </script>\n", | |
" </div>\n", | |
" </div>\n", | |
" " | |
] | |
}, | |
"metadata": {} | |
} | |
], | |
"source": [ | |
"# select whether you want probabilities or classes for predictions\n", | |
"# using this boolean flag\n", | |
"proba = True\n", | |
"\n", | |
"# obtain predictions\n", | |
"dc_preds = predict(dc_run_id, proba=proba)\n", | |
"display(dc_preds.head())\n", | |
"\n", | |
"lr_preds = predict(lr_run_id, proba=proba)\n", | |
"display(lr_preds.head())\n", | |
"\n", | |
"dt_preds = predict(dt_run_id, proba=proba)\n", | |
"display(dt_preds.head())\n", | |
"\n", | |
"rf_preds = predict(rf_run_id, proba=proba)\n", | |
"display(rf_preds.head())\n", | |
"\n", | |
"hgb_preds = predict(hgb_run_id, proba=proba)\n", | |
"display(hgb_preds.head())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "1110c9f9", | |
"metadata": { | |
"papermill": { | |
"duration": 0.012878, | |
"end_time": "2022-10-11T08:54:02.030918", | |
"exception": false, | |
"start_time": "2022-10-11T08:54:02.018040", | |
"status": "completed" | |
}, | |
"tags": [], | |
"id": "1110c9f9" | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.12" | |
}, | |
"papermill": { | |
"default_parameters": {}, | |
"duration": 34.007435, | |
"end_time": "2022-10-11T08:54:04.663511", | |
"environment_variables": {}, | |
"exception": null, | |
"input_path": "__notebook__.ipynb", | |
"output_path": "__notebook__.ipynb", | |
"parameters": {}, | |
"start_time": "2022-10-11T08:53:30.656076", | |
"version": "2.3.4" | |
}, | |
"colab": { | |
"provenance": [], | |
"collapsed_sections": [], | |
"include_colab_link": true | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment