Last active
March 28, 2022 17:31
-
-
Save noahjgreen295/4787ea99e14f48d2ffc9606354c3b5b5 to your computer and use it in GitHub Desktop.
Sample Notebook for automatic feature dropping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "21412476", | |
"metadata": {}, | |
"source": [ | |
"# Sample Feature Drop " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "26352253", | |
"metadata": {}, | |
"source": [ | |
"## Imports" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "4d60ac5e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"from sklearn.compose import ColumnTransformer\n", | |
"from sklearn.pipeline import Pipeline\n", | |
"from sklearn.preprocessing import OneHotEncoder as OneHotEncoderSklearn\n", | |
"from sklearn.preprocessing import StandardScaler\n", | |
"from sklearn.linear_model import LinearRegression\n", | |
"\n", | |
"from feature_engine.encoding import OneHotEncoder as OneHotEncoderFeatureEngine\n", | |
"from feature_engine.wrappers import SklearnTransformerWrapper\n", | |
"from feature_engine.selection import DropFeatures" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "9c0771ec", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "8ccac235", | |
"metadata": {}, | |
"source": [ | |
"## Helper functions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "a455335e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def try_fit(pipeline: Pipeline, X, y):\n", | |
" success: bool = True\n", | |
" try:\n", | |
" pipeline.fit(X, y)\n", | |
" except Exception as e:\n", | |
" success = False\n", | |
" return \"Fit succeeded\" if success else \"Failed\"\n", | |
" \n", | |
"def show_features_sk(pipeline: Pipeline):\n", | |
" return pipeline[\"transforms\"].get_feature_names_out().tolist()\n", | |
"\n", | |
"def show_features_fe(pipeline: Pipeline, X):\n", | |
" return pipeline[\"transforms\"].transform(X).columns.tolist()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "aeedf510", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "50112907", | |
"metadata": {}, | |
"source": [ | |
"## Setup Sample DataFrame" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "c64a57bf", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>x1</th>\n", | |
" <th>x2</th>\n", | |
" <th>x3</th>\n", | |
" <th>y</th>\n", | |
" <th>x4</th>\n", | |
" <th>x5</th>\n", | |
" <th>x6</th>\n", | |
" <th>x7</th>\n", | |
" <th>x8</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>a</td>\n", | |
" <td>g</td>\n", | |
" <td>18.415773</td>\n", | |
" <td>-1510.535719</td>\n", | |
" <td>h</td>\n", | |
" <td>c</td>\n", | |
" <td>f</td>\n", | |
" <td>i</td>\n", | |
" <td>f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>b</td>\n", | |
" <td>g</td>\n", | |
" <td>91.830742</td>\n", | |
" <td>-1475.515643</td>\n", | |
" <td>f</td>\n", | |
" <td>h</td>\n", | |
" <td>i</td>\n", | |
" <td>j</td>\n", | |
" <td>d</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>a</td>\n", | |
" <td>b</td>\n", | |
" <td>46.133657</td>\n", | |
" <td>-1446.278123</td>\n", | |
" <td>f</td>\n", | |
" <td>b</td>\n", | |
" <td>a</td>\n", | |
" <td>b</td>\n", | |
" <td>i</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>a</td>\n", | |
" <td>c</td>\n", | |
" <td>71.856817</td>\n", | |
" <td>-1442.431217</td>\n", | |
" <td>i</td>\n", | |
" <td>j</td>\n", | |
" <td>i</td>\n", | |
" <td>c</td>\n", | |
" <td>d</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>b</td>\n", | |
" <td>d</td>\n", | |
" <td>62.890009</td>\n", | |
" <td>-1454.254753</td>\n", | |
" <td>c</td>\n", | |
" <td>d</td>\n", | |
" <td>g</td>\n", | |
" <td>i</td>\n", | |
" <td>e</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" x1 x2 x3 y x4 x5 x6 x7 x8\n", | |
"0 a g 18.415773 -1510.535719 h c f i f\n", | |
"1 b g 91.830742 -1475.515643 f h i j d\n", | |
"2 a b 46.133657 -1446.278123 f b a b i\n", | |
"3 a c 71.856817 -1442.431217 i j i c d\n", | |
"4 b d 62.890009 -1454.254753 c d g i e" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Sample DataFrame where y is a function of x1, x2, and x3.\n", | |
"df_sample = pd.DataFrame({\n", | |
" \"x1\": [\"a\", \"b\", \"a\", \"a\", \"b\"],\n", | |
" \"x2\": [\"g\", \"g\", \"b\", \"c\", \"d\"],\n", | |
" \"x3\": np.random.uniform(size=5)*100\n", | |
"})\n", | |
"df_sample = (\n", | |
" df_sample.assign(y=lambda x: x.x1.apply(ord)*-5 + x.x2.apply(ord)*-10 + x.x3*0.5)\n", | |
")\n", | |
"df_sample.y = df_sample.y + [np.random.normal(0, 0.5)*10 for i in range(len(df_sample))]\n", | |
"\n", | |
"# x4 thru x8 are 5 random categorical columns we don't care about\n", | |
"rand_vals = \"abcdefghij\"\n", | |
"for i in range(5):\n", | |
" df_sample[f\"x{i+4}\"] = [rand_vals[np.random.randint(len(rand_vals))] for j in range(len(df_sample))]\n", | |
"df_sample" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "25c78c4a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "51e46d28", | |
"metadata": {}, | |
"source": [ | |
"## X, y Split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "8f596484", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>x1</th>\n", | |
" <th>x2</th>\n", | |
" <th>x3</th>\n", | |
" <th>x4</th>\n", | |
" <th>x5</th>\n", | |
" <th>x6</th>\n", | |
" <th>x7</th>\n", | |
" <th>x8</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>a</td>\n", | |
" <td>g</td>\n", | |
" <td>18.415773</td>\n", | |
" <td>h</td>\n", | |
" <td>c</td>\n", | |
" <td>f</td>\n", | |
" <td>i</td>\n", | |
" <td>f</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>b</td>\n", | |
" <td>g</td>\n", | |
" <td>91.830742</td>\n", | |
" <td>f</td>\n", | |
" <td>h</td>\n", | |
" <td>i</td>\n", | |
" <td>j</td>\n", | |
" <td>d</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>a</td>\n", | |
" <td>b</td>\n", | |
" <td>46.133657</td>\n", | |
" <td>f</td>\n", | |
" <td>b</td>\n", | |
" <td>a</td>\n", | |
" <td>b</td>\n", | |
" <td>i</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>a</td>\n", | |
" <td>c</td>\n", | |
" <td>71.856817</td>\n", | |
" <td>i</td>\n", | |
" <td>j</td>\n", | |
" <td>i</td>\n", | |
" <td>c</td>\n", | |
" <td>d</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>b</td>\n", | |
" <td>d</td>\n", | |
" <td>62.890009</td>\n", | |
" <td>c</td>\n", | |
" <td>d</td>\n", | |
" <td>g</td>\n", | |
" <td>i</td>\n", | |
" <td>e</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" x1 x2 x3 x4 x5 x6 x7 x8\n", | |
"0 a g 18.415773 h c f i f\n", | |
"1 b g 91.830742 f h i j d\n", | |
"2 a b 46.133657 f b a b i\n", | |
"3 a c 71.856817 i j i c d\n", | |
"4 b d 62.890009 c d g i e" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# We don't filter out x4 thru x8, idea is there could be many non-feature columns we are unaware of \n", | |
"X: pd.DataFrame = df_sample.drop(columns=\"y\")\n", | |
"y: pd.Series = df_sample[\"y\"]\n", | |
"X" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "77d24c3b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "04de501f", | |
"metadata": {}, | |
"source": [ | |
"## Pipeline using sklearn ColumnTransformer\n", | |
"### Automatic ignoring of non-features and successful fit" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "efb56a98", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Pipeline using sklearn ColumnTransformer\n", | |
"pipeline_sk = Pipeline(steps=[\n", | |
" (\n", | |
" \"transforms\",\n", | |
" ColumnTransformer(\n", | |
" transformers=[\n", | |
" (\"features_cat\", OneHotEncoderSklearn(drop=\"first\"), [\"x1\", \"x2\"]),\n", | |
" (\"features_num\", StandardScaler(), [\"x3\"])\n", | |
" ],\n", | |
" \n", | |
" # Key: this automatically drops unspecified columns for us\n", | |
" remainder=\"drop\"\n", | |
" )\n", | |
" ),\n", | |
" (\"model\", LinearRegression())\n", | |
"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "eb4805a9", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Fit succeeded'" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Works without explicitly specifying ignore of x4 thru x8\n", | |
"try_fit(pipeline_sk, X, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "c886fce8", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['features_cat__x1_b',\n", | |
" 'features_cat__x2_c',\n", | |
" 'features_cat__x2_d',\n", | |
" 'features_cat__x2_g',\n", | |
" 'features_num__x3']" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Features automatically filtered\n", | |
"show_features_sk(pipeline_sk)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "800fa29c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c745ecaa", | |
"metadata": {}, | |
"source": [ | |
"## Pipeline using feature-engine transformers, no ColumnTransformer\n", | |
"### Fails because cannot automatically filter non-features, which make it through unencoded to model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "495ff2f7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# How to do with feature-engine, and not use ColumnTransformer?\n", | |
"# (We use nested Pipeline for transforms to make it easy to show features used)\n", | |
"pipeline_fe = Pipeline(steps=[\n", | |
" (\"transforms\", Pipeline(steps=[\n", | |
" (\"features_cat\", OneHotEncoderFeatureEngine(drop_last=True, variables=[\"x1\", \"x2\"])),\n", | |
" (\"features_num\", SklearnTransformerWrapper(transformer=StandardScaler(), variables=[\"x3\"]))\n", | |
" ])),\n", | |
" (\"model\", LinearRegression())\n", | |
"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "6a7f0d65", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Failed'" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Fails without specifying ignore of x4 thru 8\n", | |
"try_fit(pipeline_fe, X, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "afcae912", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x1_a', 'x2_g', 'x2_b', 'x2_c']" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Because extra, unencoded features make it through\n", | |
"show_features_fe(pipeline_fe, X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "06f521c1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "67c0ed85", | |
"metadata": {}, | |
"source": [ | |
"## Must either explicitly drop beforehand..." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "909d259c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"X_cleaned = X.drop(columns=[\"x4\", \"x5\", \"x6\", \"x7\", \"x8\"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "80ac8f4e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Fit succeeded'" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Fit succeeds\n", | |
"try_fit(pipeline_fe, X_cleaned, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "88ead019", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['x3', 'x1_a', 'x2_g', 'x2_b', 'x2_c']" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Because unused features were explicitly dropped beforehand\n", | |
"show_features_fe(pipeline_fe, X_cleaned)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "53b62883", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "1cfb8252", | |
"metadata": {}, | |
"source": [ | |
"## ...Or explicitly use DropFeatures()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "cfb3cc30", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pipeline_fe_2 = Pipeline(steps=[\n", | |
" (\"transforms\", Pipeline(steps=[\n", | |
" (\"features_cat\", OneHotEncoderFeatureEngine(drop_last=True, variables=[\"x1\", \"x2\"])),\n", | |
" (\"features_num\", SklearnTransformerWrapper(transformer=StandardScaler(), variables=[\"x3\"])),\n", | |
" (\"drop_unused\", DropFeatures(features_to_drop=[\"x4\", \"x5\", \"x6\", \"x7\", \"x8\"]))\n", | |
" ])),\n", | |
" (\"model\", LinearRegression())\n", | |
"])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "cd035171", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'Fit succeeded'" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Fit succeesd\n", | |
"try_fit(pipeline_fe_2, X, y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "cf1df2c1", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['x3', 'x1_a', 'x2_g', 'x2_b', 'x2_c']" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Because extra, unencoded features were dropped explicitly in pipeline\n", | |
"show_features_fe(pipeline_fe_2, X)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "9b9d4bee", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "457c457e", | |
"metadata": {}, | |
"source": [ | |
"## Either of these approaches requires foreknowledge of which features to get rid of\n", | |
"## This could be difficult in a dynamic setting, particularly deployment " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "e8a6bc40", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "ef3b8e86", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment