Skip to content

Instantly share code, notes, and snippets.

@noahjgreen295
Last active March 28, 2022 17:31
Show Gist options
  • Save noahjgreen295/4787ea99e14f48d2ffc9606354c3b5b5 to your computer and use it in GitHub Desktop.
Save noahjgreen295/4787ea99e14f48d2ffc9606354c3b5b5 to your computer and use it in GitHub Desktop.
Sample Notebook for automatic feature dropping
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "21412476",
"metadata": {},
"source": [
"# Sample Feature Drop "
]
},
{
"cell_type": "markdown",
"id": "26352253",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "4d60ac5e",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder as OneHotEncoderSklearn\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"from feature_engine.encoding import OneHotEncoder as OneHotEncoderFeatureEngine\n",
"from feature_engine.wrappers import SklearnTransformerWrapper\n",
"from feature_engine.selection import DropFeatures"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c0771ec",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "8ccac235",
"metadata": {},
"source": [
"## Helper functions"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a455335e",
"metadata": {},
"outputs": [],
"source": [
"def try_fit(pipeline: Pipeline, X, y):\n",
" success: bool = True\n",
" try:\n",
" pipeline.fit(X, y)\n",
" except Exception as e:\n",
" success = False\n",
" return \"Fit succeeded\" if success else \"Failed\"\n",
" \n",
"def show_features_sk(pipeline: Pipeline):\n",
" return pipeline[\"transforms\"].get_feature_names_out().tolist()\n",
"\n",
"def show_features_fe(pipeline: Pipeline, X):\n",
" return pipeline[\"transforms\"].transform(X).columns.tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aeedf510",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "50112907",
"metadata": {},
"source": [
"## Setup Sample DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c64a57bf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>x3</th>\n",
" <th>y</th>\n",
" <th>x4</th>\n",
" <th>x5</th>\n",
" <th>x6</th>\n",
" <th>x7</th>\n",
" <th>x8</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>a</td>\n",
" <td>g</td>\n",
" <td>18.415773</td>\n",
" <td>-1510.535719</td>\n",
" <td>h</td>\n",
" <td>c</td>\n",
" <td>f</td>\n",
" <td>i</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>b</td>\n",
" <td>g</td>\n",
" <td>91.830742</td>\n",
" <td>-1475.515643</td>\n",
" <td>f</td>\n",
" <td>h</td>\n",
" <td>i</td>\n",
" <td>j</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>a</td>\n",
" <td>b</td>\n",
" <td>46.133657</td>\n",
" <td>-1446.278123</td>\n",
" <td>f</td>\n",
" <td>b</td>\n",
" <td>a</td>\n",
" <td>b</td>\n",
" <td>i</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>a</td>\n",
" <td>c</td>\n",
" <td>71.856817</td>\n",
" <td>-1442.431217</td>\n",
" <td>i</td>\n",
" <td>j</td>\n",
" <td>i</td>\n",
" <td>c</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>b</td>\n",
" <td>d</td>\n",
" <td>62.890009</td>\n",
" <td>-1454.254753</td>\n",
" <td>c</td>\n",
" <td>d</td>\n",
" <td>g</td>\n",
" <td>i</td>\n",
" <td>e</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" x1 x2 x3 y x4 x5 x6 x7 x8\n",
"0 a g 18.415773 -1510.535719 h c f i f\n",
"1 b g 91.830742 -1475.515643 f h i j d\n",
"2 a b 46.133657 -1446.278123 f b a b i\n",
"3 a c 71.856817 -1442.431217 i j i c d\n",
"4 b d 62.890009 -1454.254753 c d g i e"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sample DataFrame where y is a function of x1, x2, and x3.\n",
"df_sample = pd.DataFrame({\n",
" \"x1\": [\"a\", \"b\", \"a\", \"a\", \"b\"],\n",
" \"x2\": [\"g\", \"g\", \"b\", \"c\", \"d\"],\n",
" \"x3\": np.random.uniform(size=5)*100\n",
"})\n",
"df_sample = (\n",
" df_sample.assign(y=lambda x: x.x1.apply(ord)*-5 + x.x2.apply(ord)*-10 + x.x3*0.5)\n",
")\n",
"df_sample.y = df_sample.y + [np.random.normal(0, 0.5)*10 for i in range(len(df_sample))]\n",
"\n",
"# x4 thru x8 are 5 random categorical columns we don't care about\n",
"rand_vals = \"abcdefghij\"\n",
"for i in range(5):\n",
" df_sample[f\"x{i+4}\"] = [rand_vals[np.random.randint(len(rand_vals))] for j in range(len(df_sample))]\n",
"df_sample"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "25c78c4a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "51e46d28",
"metadata": {},
"source": [
"## X, y Split"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8f596484",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>x3</th>\n",
" <th>x4</th>\n",
" <th>x5</th>\n",
" <th>x6</th>\n",
" <th>x7</th>\n",
" <th>x8</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>a</td>\n",
" <td>g</td>\n",
" <td>18.415773</td>\n",
" <td>h</td>\n",
" <td>c</td>\n",
" <td>f</td>\n",
" <td>i</td>\n",
" <td>f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>b</td>\n",
" <td>g</td>\n",
" <td>91.830742</td>\n",
" <td>f</td>\n",
" <td>h</td>\n",
" <td>i</td>\n",
" <td>j</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>a</td>\n",
" <td>b</td>\n",
" <td>46.133657</td>\n",
" <td>f</td>\n",
" <td>b</td>\n",
" <td>a</td>\n",
" <td>b</td>\n",
" <td>i</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>a</td>\n",
" <td>c</td>\n",
" <td>71.856817</td>\n",
" <td>i</td>\n",
" <td>j</td>\n",
" <td>i</td>\n",
" <td>c</td>\n",
" <td>d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>b</td>\n",
" <td>d</td>\n",
" <td>62.890009</td>\n",
" <td>c</td>\n",
" <td>d</td>\n",
" <td>g</td>\n",
" <td>i</td>\n",
" <td>e</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" x1 x2 x3 x4 x5 x6 x7 x8\n",
"0 a g 18.415773 h c f i f\n",
"1 b g 91.830742 f h i j d\n",
"2 a b 46.133657 f b a b i\n",
"3 a c 71.856817 i j i c d\n",
"4 b d 62.890009 c d g i e"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We don't filter out x4 thru x8, idea is there could be many non-feature columns we are unaware of \n",
"X: pd.DataFrame = df_sample.drop(columns=\"y\")\n",
"y: pd.Series = df_sample[\"y\"]\n",
"X"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77d24c3b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "04de501f",
"metadata": {},
"source": [
"## Pipeline using sklearn ColumnTransformer\n",
"### Automatic ignoring of non-features and successful fit"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "efb56a98",
"metadata": {},
"outputs": [],
"source": [
"# Pipeline using sklearn ColumnTransformer\n",
"pipeline_sk = Pipeline(steps=[\n",
" (\n",
" \"transforms\",\n",
" ColumnTransformer(\n",
" transformers=[\n",
" (\"features_cat\", OneHotEncoderSklearn(drop=\"first\"), [\"x1\", \"x2\"]),\n",
" (\"features_num\", StandardScaler(), [\"x3\"])\n",
" ],\n",
" \n",
" # Key: this automatically drops unspecified columns for us\n",
" remainder=\"drop\"\n",
" )\n",
" ),\n",
" (\"model\", LinearRegression())\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "eb4805a9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Fit succeeded'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Works without explicitly specifying ignore of x4 thru x8\n",
"try_fit(pipeline_sk, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c886fce8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['features_cat__x1_b',\n",
" 'features_cat__x2_c',\n",
" 'features_cat__x2_d',\n",
" 'features_cat__x2_g',\n",
" 'features_num__x3']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Features automatically filtered\n",
"show_features_sk(pipeline_sk)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "800fa29c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "c745ecaa",
"metadata": {},
"source": [
"## Pipeline using feature-engine transformers, no ColumnTransformer\n",
"### Fails because cannot automatically filter non-features, which make it through unencoded to model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "495ff2f7",
"metadata": {},
"outputs": [],
"source": [
"# How to do with feature-engine, and not use ColumnTransformer?\n",
"# (We use nested Pipeline for transforms to make it easy to show features used)\n",
"pipeline_fe = Pipeline(steps=[\n",
" (\"transforms\", Pipeline(steps=[\n",
" (\"features_cat\", OneHotEncoderFeatureEngine(drop_last=True, variables=[\"x1\", \"x2\"])),\n",
" (\"features_num\", SklearnTransformerWrapper(transformer=StandardScaler(), variables=[\"x3\"]))\n",
" ])),\n",
" (\"model\", LinearRegression())\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "6a7f0d65",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Failed'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fails without specifying ignore of x4 thru 8\n",
"try_fit(pipeline_fe, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "afcae912",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x1_a', 'x2_g', 'x2_b', 'x2_c']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Because extra, unencoded features make it through\n",
"show_features_fe(pipeline_fe, X)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06f521c1",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "67c0ed85",
"metadata": {},
"source": [
"## Must either explicitly drop beforehand..."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "909d259c",
"metadata": {},
"outputs": [],
"source": [
"X_cleaned = X.drop(columns=[\"x4\", \"x5\", \"x6\", \"x7\", \"x8\"])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "80ac8f4e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Fit succeeded'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit succeeds\n",
"try_fit(pipeline_fe, X_cleaned, y)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "88ead019",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['x3', 'x1_a', 'x2_g', 'x2_b', 'x2_c']"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Because unused features were explicitly dropped beforehand\n",
"show_features_fe(pipeline_fe, X_cleaned)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53b62883",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "1cfb8252",
"metadata": {},
"source": [
"## ...Or explicitly use DropFeatures()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "cfb3cc30",
"metadata": {},
"outputs": [],
"source": [
"pipeline_fe_2 = Pipeline(steps=[\n",
" (\"transforms\", Pipeline(steps=[\n",
" (\"features_cat\", OneHotEncoderFeatureEngine(drop_last=True, variables=[\"x1\", \"x2\"])),\n",
" (\"features_num\", SklearnTransformerWrapper(transformer=StandardScaler(), variables=[\"x3\"])),\n",
" (\"drop_unused\", DropFeatures(features_to_drop=[\"x4\", \"x5\", \"x6\", \"x7\", \"x8\"]))\n",
" ])),\n",
" (\"model\", LinearRegression())\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "cd035171",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Fit succeeded'"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fit succeesd\n",
"try_fit(pipeline_fe_2, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "cf1df2c1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['x3', 'x1_a', 'x2_g', 'x2_b', 'x2_c']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Because extra, unencoded features were dropped explicitly in pipeline\n",
"show_features_fe(pipeline_fe_2, X)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b9d4bee",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "457c457e",
"metadata": {},
"source": [
"## Either of these approaches requires foreknowledge of which features to get rid of\n",
"## This could be difficult in a dynamic setting, particularly deployment "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e8a6bc40",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef3b8e86",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment