noahjgreen295/sample_feature_drop_notebook.ipynb

## sample_feature_drop_notebook.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "21412476",
   "metadata": {},
   "source": [
    "# Sample Feature Drop "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "26352253",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4d60ac5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.preprocessing import OneHotEncoder as OneHotEncoderSklearn\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "from feature_engine.encoding import OneHotEncoder as OneHotEncoderFeatureEngine\n",
    "from feature_engine.wrappers import SklearnTransformerWrapper\n",
    "from feature_engine.selection import DropFeatures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c0771ec",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "8ccac235",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a455335e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def try_fit(pipeline: Pipeline, X, y):\n",
    "    success: bool = True\n",
    "    try:\n",
    "        pipeline.fit(X, y)\n",
    "    except Exception as e:\n",
    "        success = False\n",
    "    return \"Fit succeeded\" if success else \"Failed\"\n",
    "    \n",
    "def show_features_sk(pipeline: Pipeline):\n",
    "    return pipeline[\"transforms\"].get_feature_names_out().tolist()\n",
    "\n",
    "def show_features_fe(pipeline: Pipeline, X):\n",
    "    return pipeline[\"transforms\"].transform(X).columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aeedf510",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "50112907",
   "metadata": {},
   "source": [
    "## Setup Sample DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c64a57bf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>x3</th>\n",
       "      <th>y</th>\n",
       "      <th>x4</th>\n",
       "      <th>x5</th>\n",
       "      <th>x6</th>\n",
       "      <th>x7</th>\n",
       "      <th>x8</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>g</td>\n",
       "      <td>18.415773</td>\n",
       "      <td>-1510.535719</td>\n",
       "      <td>h</td>\n",
       "      <td>c</td>\n",
       "      <td>f</td>\n",
       "      <td>i</td>\n",
       "      <td>f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b</td>\n",
       "      <td>g</td>\n",
       "      <td>91.830742</td>\n",
       "      <td>-1475.515643</td>\n",
       "      <td>f</td>\n",
       "      <td>h</td>\n",
       "      <td>i</td>\n",
       "      <td>j</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a</td>\n",
       "      <td>b</td>\n",
       "      <td>46.133657</td>\n",
       "      <td>-1446.278123</td>\n",
       "      <td>f</td>\n",
       "      <td>b</td>\n",
       "      <td>a</td>\n",
       "      <td>b</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>71.856817</td>\n",
       "      <td>-1442.431217</td>\n",
       "      <td>i</td>\n",
       "      <td>j</td>\n",
       "      <td>i</td>\n",
       "      <td>c</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>b</td>\n",
       "      <td>d</td>\n",
       "      <td>62.890009</td>\n",
       "      <td>-1454.254753</td>\n",
       "      <td>c</td>\n",
       "      <td>d</td>\n",
       "      <td>g</td>\n",
       "      <td>i</td>\n",
       "      <td>e</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  x1 x2         x3            y x4 x5 x6 x7 x8\n",
       "0  a  g  18.415773 -1510.535719  h  c  f  i  f\n",
       "1  b  g  91.830742 -1475.515643  f  h  i  j  d\n",
       "2  a  b  46.133657 -1446.278123  f  b  a  b  i\n",
       "3  a  c  71.856817 -1442.431217  i  j  i  c  d\n",
       "4  b  d  62.890009 -1454.254753  c  d  g  i  e"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sample DataFrame where y is a function of x1, x2, and x3.\n",
    "df_sample = pd.DataFrame({\n",
    "    \"x1\": [\"a\", \"b\", \"a\", \"a\", \"b\"],\n",
    "    \"x2\": [\"g\", \"g\", \"b\", \"c\", \"d\"],\n",
    "    \"x3\": np.random.uniform(size=5)*100\n",
    "})\n",
    "df_sample = (\n",
    "    df_sample.assign(y=lambda x: x.x1.apply(ord)*-5 + x.x2.apply(ord)*-10 + x.x3*0.5)\n",
    ")\n",
    "df_sample.y = df_sample.y + [np.random.normal(0, 0.5)*10 for i in range(len(df_sample))]\n",
    "\n",
    "# x4 thru x8 are 5 random categorical columns we don't care about\n",
    "rand_vals = \"abcdefghij\"\n",
    "for i in range(5):\n",
    "    df_sample[f\"x{i+4}\"] = [rand_vals[np.random.randint(len(rand_vals))] for j in range(len(df_sample))]\n",
    "df_sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25c78c4a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "51e46d28",
   "metadata": {},
   "source": [
    "## X, y Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8f596484",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x1</th>\n",
       "      <th>x2</th>\n",
       "      <th>x3</th>\n",
       "      <th>x4</th>\n",
       "      <th>x5</th>\n",
       "      <th>x6</th>\n",
       "      <th>x7</th>\n",
       "      <th>x8</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>a</td>\n",
       "      <td>g</td>\n",
       "      <td>18.415773</td>\n",
       "      <td>h</td>\n",
       "      <td>c</td>\n",
       "      <td>f</td>\n",
       "      <td>i</td>\n",
       "      <td>f</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>b</td>\n",
       "      <td>g</td>\n",
       "      <td>91.830742</td>\n",
       "      <td>f</td>\n",
       "      <td>h</td>\n",
       "      <td>i</td>\n",
       "      <td>j</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a</td>\n",
       "      <td>b</td>\n",
       "      <td>46.133657</td>\n",
       "      <td>f</td>\n",
       "      <td>b</td>\n",
       "      <td>a</td>\n",
       "      <td>b</td>\n",
       "      <td>i</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>a</td>\n",
       "      <td>c</td>\n",
       "      <td>71.856817</td>\n",
       "      <td>i</td>\n",
       "      <td>j</td>\n",
       "      <td>i</td>\n",
       "      <td>c</td>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>b</td>\n",
       "      <td>d</td>\n",
       "      <td>62.890009</td>\n",
       "      <td>c</td>\n",
       "      <td>d</td>\n",
       "      <td>g</td>\n",
       "      <td>i</td>\n",
       "      <td>e</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  x1 x2         x3 x4 x5 x6 x7 x8\n",
       "0  a  g  18.415773  h  c  f  i  f\n",
       "1  b  g  91.830742  f  h  i  j  d\n",
       "2  a  b  46.133657  f  b  a  b  i\n",
       "3  a  c  71.856817  i  j  i  c  d\n",
       "4  b  d  62.890009  c  d  g  i  e"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We don't filter out x4 thru x8, idea is there could be many non-feature columns we are unaware of \n",
    "X: pd.DataFrame = df_sample.drop(columns=\"y\")\n",
    "y: pd.Series  = df_sample[\"y\"]\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "77d24c3b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "04de501f",
   "metadata": {},
   "source": [
    "## Pipeline using sklearn ColumnTransformer\n",
    "### Automatic ignoring of non-features and successful fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "efb56a98",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pipeline using sklearn ColumnTransformer\n",
    "pipeline_sk = Pipeline(steps=[\n",
    "    (\n",
    "        \"transforms\",\n",
    "        ColumnTransformer(\n",
    "            transformers=[\n",
    "                (\"features_cat\", OneHotEncoderSklearn(drop=\"first\"), [\"x1\", \"x2\"]),\n",
    "                (\"features_num\", StandardScaler(), [\"x3\"])\n",
    "            ],\n",
    "            \n",
    "            # Key: this automatically drops unspecified columns for us\n",
    "            remainder=\"drop\"\n",
    "        )\n",
    "    ),\n",
    "    (\"model\", LinearRegression())\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "eb4805a9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Fit succeeded'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Works without explicitly specifying ignore of x4 thru x8\n",
    "try_fit(pipeline_sk, X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c886fce8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['features_cat__x1_b',\n",
       " 'features_cat__x2_c',\n",
       " 'features_cat__x2_d',\n",
       " 'features_cat__x2_g',\n",
       " 'features_num__x3']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Features automatically filtered\n",
    "show_features_sk(pipeline_sk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "800fa29c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "c745ecaa",
   "metadata": {},
   "source": [
    "## Pipeline using feature-engine transformers, no ColumnTransformer\n",
    "### Fails because it cannot automatically filter out non-features, which make it through unencoded to the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "495ff2f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# How to do with feature-engine, and not use ColumnTransformer?\n",
    "# (We use nested Pipeline for transforms to make it easy to show features used)\n",
    "pipeline_fe = Pipeline(steps=[\n",
    "    (\"transforms\", Pipeline(steps=[\n",
    "        (\"features_cat\", OneHotEncoderFeatureEngine(drop_last=True, variables=[\"x1\", \"x2\"])),\n",
    "        (\"features_num\", SklearnTransformerWrapper(transformer=StandardScaler(), variables=[\"x3\"]))\n",
    "    ])),\n",
    "    (\"model\", LinearRegression())\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6a7f0d65",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Failed'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fails without specifying ignore of x4 thru x8\n",
    "try_fit(pipeline_fe, X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "afcae912",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x1_a', 'x2_g', 'x2_b', 'x2_c']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Because extra, unencoded features make it through\n",
    "show_features_fe(pipeline_fe, X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06f521c1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "67c0ed85",
   "metadata": {},
   "source": [
    "## Must either explicitly drop beforehand..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "909d259c",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_cleaned = X.drop(columns=[\"x4\", \"x5\", \"x6\", \"x7\", \"x8\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "80ac8f4e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Fit succeeded'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit succeeds\n",
    "try_fit(pipeline_fe, X_cleaned, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "88ead019",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['x3', 'x1_a', 'x2_g', 'x2_b', 'x2_c']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Because unused features were explicitly dropped beforehand\n",
    "show_features_fe(pipeline_fe, X_cleaned)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53b62883",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "1cfb8252",
   "metadata": {},
   "source": [
    "## ...Or explicitly use DropFeatures()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "cfb3cc30",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_fe_2 = Pipeline(steps=[\n",
    "    (\"transforms\", Pipeline(steps=[\n",
    "        (\"features_cat\", OneHotEncoderFeatureEngine(drop_last=True, variables=[\"x1\", \"x2\"])),\n",
    "        (\"features_num\", SklearnTransformerWrapper(transformer=StandardScaler(), variables=[\"x3\"])),\n",
    "        (\"drop_unused\", DropFeatures(features_to_drop=[\"x4\", \"x5\", \"x6\", \"x7\", \"x8\"]))\n",
    "    ])),\n",
    "    (\"model\", LinearRegression())\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "cd035171",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Fit succeeded'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit succeeds\n",
    "try_fit(pipeline_fe_2, X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "cf1df2c1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['x3', 'x1_a', 'x2_g', 'x2_b', 'x2_c']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Because extra, unencoded features were dropped explicitly in pipeline\n",
    "show_features_fe(pipeline_fe_2, X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b9d4bee",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "457c457e",
   "metadata": {},
   "source": [
    "## Either of these approaches requires foreknowledge of which features to get rid of\n",
    "## This could be difficult in a dynamic setting, particularly deployment "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8a6bc40",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef3b8e86",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "21412476",
	"metadata": {},
	"source": [
	"# Sample Feature Drop "
	]
	},
	{
	"cell_type": "markdown",
	"id": "26352253",
	"metadata": {},
	"source": [
	"## Imports"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "4d60ac5e",
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"\n",
	"from sklearn.compose import ColumnTransformer\n",
	"from sklearn.pipeline import Pipeline\n",
	"from sklearn.preprocessing import OneHotEncoder as OneHotEncoderSklearn\n",
	"from sklearn.preprocessing import StandardScaler\n",
	"from sklearn.linear_model import LinearRegression\n",
	"\n",
	"from feature_engine.encoding import OneHotEncoder as OneHotEncoderFeatureEngine\n",
	"from feature_engine.wrappers import SklearnTransformerWrapper\n",
	"from feature_engine.selection import DropFeatures"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "9c0771ec",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"id": "8ccac235",
	"metadata": {},
	"source": [
	"## Helper functions"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "a455335e",
	"metadata": {},
	"outputs": [],
	"source": [
	"def try_fit(pipeline: Pipeline, X, y):\n",
	"    success: bool = True\n",
	"    try:\n",
	"        pipeline.fit(X, y)\n",
	"    except Exception as e:\n",
	"        success = False\n",
	"    return \"Fit succeeded\" if success else \"Failed\"\n",
	"    \n",
	"def show_features_sk(pipeline: Pipeline):\n",
	"    return pipeline[\"transforms\"].get_feature_names_out().tolist()\n",
	"\n",
	"def show_features_fe(pipeline: Pipeline, X):\n",
	"    return pipeline[\"transforms\"].transform(X).columns.tolist()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "aeedf510",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"id": "50112907",
	"metadata": {},
	"source": [
	"## Setup Sample DataFrame"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "c64a57bf",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>x1</th>\n",
	" <th>x2</th>\n",
	" <th>x3</th>\n",
	" <th>y</th>\n",
	" <th>x4</th>\n",
	" <th>x5</th>\n",
	" <th>x6</th>\n",
	" <th>x7</th>\n",
	" <th>x8</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>a</td>\n",
	" <td>g</td>\n",
	" <td>18.415773</td>\n",
	" <td>-1510.535719</td>\n",
	" <td>h</td>\n",
	" <td>c</td>\n",
	" <td>f</td>\n",
	" <td>i</td>\n",
	" <td>f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>b</td>\n",
	" <td>g</td>\n",
	" <td>91.830742</td>\n",
	" <td>-1475.515643</td>\n",
	" <td>f</td>\n",
	" <td>h</td>\n",
	" <td>i</td>\n",
	" <td>j</td>\n",
	" <td>d</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>a</td>\n",
	" <td>b</td>\n",
	" <td>46.133657</td>\n",
	" <td>-1446.278123</td>\n",
	" <td>f</td>\n",
	" <td>b</td>\n",
	" <td>a</td>\n",
	" <td>b</td>\n",
	" <td>i</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>a</td>\n",
	" <td>c</td>\n",
	" <td>71.856817</td>\n",
	" <td>-1442.431217</td>\n",
	" <td>i</td>\n",
	" <td>j</td>\n",
	" <td>i</td>\n",
	" <td>c</td>\n",
	" <td>d</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>b</td>\n",
	" <td>d</td>\n",
	" <td>62.890009</td>\n",
	" <td>-1454.254753</td>\n",
	" <td>c</td>\n",
	" <td>d</td>\n",
	" <td>g</td>\n",
	" <td>i</td>\n",
	" <td>e</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" x1 x2 x3 y x4 x5 x6 x7 x8\n",
	"0 a g 18.415773 -1510.535719 h c f i f\n",
	"1 b g 91.830742 -1475.515643 f h i j d\n",
	"2 a b 46.133657 -1446.278123 f b a b i\n",
	"3 a c 71.856817 -1442.431217 i j i c d\n",
	"4 b d 62.890009 -1454.254753 c d g i e"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Sample DataFrame where y is a function of x1, x2, and x3.\n",
	"df_sample = pd.DataFrame({\n",
	" \"x1\": [\"a\", \"b\", \"a\", \"a\", \"b\"],\n",
	" \"x2\": [\"g\", \"g\", \"b\", \"c\", \"d\"],\n",
	" \"x3\": np.random.uniform(size=5)*100\n",
	"})\n",
	"df_sample = (\n",
	"    df_sample.assign(y=lambda x: x.x1.apply(ord)*-5 + x.x2.apply(ord)*-10 + x.x3*0.5)\n",
	")\n",
	"df_sample.y = df_sample.y + [np.random.normal(0, 0.5)*10 for i in range(len(df_sample))]\n",
	"\n",
	"# x4 thru x8 are 5 random categorical columns we don't care about\n",
	"rand_vals = \"abcdefghij\"\n",
	"for i in range(5):\n",
	" df_sample[f\"x{i+4}\"] = [rand_vals[np.random.randint(len(rand_vals))] for j in range(len(df_sample))]\n",
	"df_sample"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "25c78c4a",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"id": "51e46d28",
	"metadata": {},
	"source": [
	"## X, y Split"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "8f596484",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>x1</th>\n",
	" <th>x2</th>\n",
	" <th>x3</th>\n",
	" <th>x4</th>\n",
	" <th>x5</th>\n",
	" <th>x6</th>\n",
	" <th>x7</th>\n",
	" <th>x8</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>a</td>\n",
	" <td>g</td>\n",
	" <td>18.415773</td>\n",
	" <td>h</td>\n",
	" <td>c</td>\n",
	" <td>f</td>\n",
	" <td>i</td>\n",
	" <td>f</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>b</td>\n",
	" <td>g</td>\n",
	" <td>91.830742</td>\n",
	" <td>f</td>\n",
	" <td>h</td>\n",
	" <td>i</td>\n",
	" <td>j</td>\n",
	" <td>d</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>a</td>\n",
	" <td>b</td>\n",
	" <td>46.133657</td>\n",
	" <td>f</td>\n",
	" <td>b</td>\n",
	" <td>a</td>\n",
	" <td>b</td>\n",
	" <td>i</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>a</td>\n",
	" <td>c</td>\n",
	" <td>71.856817</td>\n",
	" <td>i</td>\n",
	" <td>j</td>\n",
	" <td>i</td>\n",
	" <td>c</td>\n",
	" <td>d</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>b</td>\n",
	" <td>d</td>\n",
	" <td>62.890009</td>\n",
	" <td>c</td>\n",
	" <td>d</td>\n",
	" <td>g</td>\n",
	" <td>i</td>\n",
	" <td>e</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" x1 x2 x3 x4 x5 x6 x7 x8\n",
	"0 a g 18.415773 h c f i f\n",
	"1 b g 91.830742 f h i j d\n",
	"2 a b 46.133657 f b a b i\n",
	"3 a c 71.856817 i j i c d\n",
	"4 b d 62.890009 c d g i e"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# We don't filter out x4 thru x8, idea is there could be many non-feature columns we are unaware of \n",
	"X: pd.DataFrame = df_sample.drop(columns=\"y\")\n",
	"y: pd.Series = df_sample[\"y\"]\n",
	"X"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "77d24c3b",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"id": "04de501f",
	"metadata": {},
	"source": [
	"## Pipeline using sklearn ColumnTransformer\n",
	"### Automatic ignoring of non-features and successful fit"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "efb56a98",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Pipeline using sklearn ColumnTransformer\n",
	"pipeline_sk = Pipeline(steps=[\n",
	" (\n",
	" \"transforms\",\n",
	" ColumnTransformer(\n",
	" transformers=[\n",
	" (\"features_cat\", OneHotEncoderSklearn(drop=\"first\"), [\"x1\", \"x2\"]),\n",
	" (\"features_num\", StandardScaler(), [\"x3\"])\n",
	" ],\n",
	" \n",
	" # Key: this automatically drops unspecified columns for us\n",
	" remainder=\"drop\"\n",
	" )\n",
	" ),\n",
	" (\"model\", LinearRegression())\n",
	"])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "eb4805a9",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'Fit succeeded'"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Works without explicitly specifying ignore of x4 thru x8\n",
	"try_fit(pipeline_sk, X, y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "c886fce8",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['features_cat__x1_b',\n",
	" 'features_cat__x2_c',\n",
	" 'features_cat__x2_d',\n",
	" 'features_cat__x2_g',\n",
	" 'features_num__x3']"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Features automatically filtered\n",
	"show_features_sk(pipeline_sk)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "800fa29c",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"id": "c745ecaa",
	"metadata": {},
	"source": [
	"## Pipeline using feature-engine transformers, no ColumnTransformer\n",
	"### Fails because it cannot automatically filter out non-features, which make it through unencoded to the model"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "495ff2f7",
	"metadata": {},
	"outputs": [],
	"source": [
	"# How to do with feature-engine, and not use ColumnTransformer?\n",
	"# (We use nested Pipeline for transforms to make it easy to show features used)\n",
	"pipeline_fe = Pipeline(steps=[\n",
	" (\"transforms\", Pipeline(steps=[\n",
	" (\"features_cat\", OneHotEncoderFeatureEngine(drop_last=True, variables=[\"x1\", \"x2\"])),\n",
	" (\"features_num\", SklearnTransformerWrapper(transformer=StandardScaler(), variables=[\"x3\"]))\n",
	" ])),\n",
	" (\"model\", LinearRegression())\n",
	"])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "6a7f0d65",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'Failed'"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Fails without specifying ignore of x4 thru x8\n",
	"try_fit(pipeline_fe, X, y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "afcae912",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x1_a', 'x2_g', 'x2_b', 'x2_c']"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Because extra, unencoded features make it through\n",
	"show_features_fe(pipeline_fe, X)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "06f521c1",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"id": "67c0ed85",
	"metadata": {},
	"source": [
	"## Must either explicitly drop beforehand..."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "909d259c",
	"metadata": {},
	"outputs": [],
	"source": [
	"X_cleaned = X.drop(columns=[\"x4\", \"x5\", \"x6\", \"x7\", \"x8\"])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"id": "80ac8f4e",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'Fit succeeded'"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Fit succeeds\n",
	"try_fit(pipeline_fe, X_cleaned, y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"id": "88ead019",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['x3', 'x1_a', 'x2_g', 'x2_b', 'x2_c']"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Because unused features were explicitly dropped beforehand\n",
	"show_features_fe(pipeline_fe, X_cleaned)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "53b62883",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"id": "1cfb8252",
	"metadata": {},
	"source": [
	"## ...Or explicitly use DropFeatures()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"id": "cfb3cc30",
	"metadata": {},
	"outputs": [],
	"source": [
	"pipeline_fe_2 = Pipeline(steps=[\n",
	" (\"transforms\", Pipeline(steps=[\n",
	" (\"features_cat\", OneHotEncoderFeatureEngine(drop_last=True, variables=[\"x1\", \"x2\"])),\n",
	" (\"features_num\", SklearnTransformerWrapper(transformer=StandardScaler(), variables=[\"x3\"])),\n",
	" (\"drop_unused\", DropFeatures(features_to_drop=[\"x4\", \"x5\", \"x6\", \"x7\", \"x8\"]))\n",
	" ])),\n",
	" (\"model\", LinearRegression())\n",
	"])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"id": "cd035171",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'Fit succeeded'"
	]
	},
	"execution_count": 15,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Fit succeeds\n",
	"try_fit(pipeline_fe_2, X, y)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"id": "cf1df2c1",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['x3', 'x1_a', 'x2_g', 'x2_b', 'x2_c']"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Because extra, unencoded features were dropped explicitly in pipeline\n",
	"show_features_fe(pipeline_fe_2, X)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "9b9d4bee",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "markdown",
	"id": "457c457e",
	"metadata": {},
	"source": [
	"## Either of these approaches requires foreknowledge of which features to get rid of\n",
	"## This could be difficult in a dynamic setting, particularly deployment "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "e8a6bc40",
	"metadata": {},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "ef3b8e86",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}