lpekelis/Taming missing features at serving time.ipynb

## Taming missing features at serving time.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import functools\n",
    "import toolz as T\n",
    "import itertools\n",
    "from typing import Dict, List, Optional, Tuple\n",
    "from tqdm import tqdm\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from plotnine import *\n",
    "import seaborn as sns\n",
    "from sklearn import impute, preprocessing, metrics\n",
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "from sklearn.pipeline import FeatureUnion, Pipeline\n",
    "from sklearn.utils import shuffle\n",
    "from xgboost import XGBRegressor\n",
    "\n",
    "# color blind friendly palette\n",
    "cbbPalette = [\"#000000\", \"#E69F00\", \"#56B4E9\", \"#009E73\", \"#F0E442\", \"#0072B2\", \"#D55E00\", \"#CC79A7\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_SIMULATIONS = 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('housetrain.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "numeric_columns = data.select_dtypes(['number']).columns\n",
    "data[numeric_columns] = data[numeric_columns].astype(float)\n",
    "data['SalePrice'] = data['SalePrice'].astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Serving occlusion (the problem)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Numeric columns, minus identifiers like Id or MSSubClass and clearly leaky features for predicting SalePrice\n",
    "# like MoSold and YrSold.\n",
    "features = [\n",
    "    '1stFlrSF',\n",
    "    '2ndFlrSF',\n",
    "    '3SsnPorch',\n",
    "    'BedroomAbvGr',\n",
    "    'BsmtFinSF1',\n",
    "    'BsmtFinSF2',\n",
    "    'BsmtFullBath',\n",
    "    'BsmtHalfBath',\n",
    "    'BsmtUnfSF',\n",
    "    'EnclosedPorch',\n",
    "    'Fireplaces',\n",
    "    'FullBath',\n",
    "    'GarageArea',\n",
    "    'GarageCars',\n",
    "    'GarageYrBlt',\n",
    "    'GrLivArea',\n",
    "    'HalfBath',\n",
    "    'KitchenAbvGr',\n",
    "    'LotArea',\n",
    "    'LotFrontage',\n",
    "    'LowQualFinSF',\n",
    "    'MasVnrArea',\n",
    "    'OpenPorchSF',\n",
    "    'OverallCond',\n",
    "    'OverallQual',\n",
    "    'PoolArea',\n",
    "    'ScreenPorch',\n",
    "    'TotRmsAbvGrd',\n",
    "    'TotalBsmtSF',\n",
    "    'WoodDeckSF',\n",
    "    'YearBuilt',\n",
    "    'YearRemodAdd'\n",
    "]\n",
    "\n",
    "label = 'SalePrice'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_estimator(features: List[str], **kwargs) -> BaseEstimator:\n",
    "    \"\"\"\n",
    "    Build an unfitted scale -> mean-impute -> linear XGBoost pipeline.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    features : names of the input columns; stored on the returned pipeline as\n",
    "        `estimator.features` so callers know which columns to select.\n",
    "    **kwargs : forwarded unchanged to `XGBRegressor`.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The unfitted `Pipeline`.\n",
    "\n",
    "    Note: `base_score` is read from the notebook-level `data` and `label`, not\n",
    "    from whatever frame the pipeline is later fitted on.\n",
    "    \"\"\"\n",
    "    # `features` used to be declared as `features = List[str]`, which made the\n",
    "    # typing object the default *value*; it is now a proper annotation.\n",
    "    estimator = Pipeline(\n",
    "        [\n",
    "            (\"scaler\", preprocessing.StandardScaler()),\n",
    "            (\"imputer\", impute.SimpleImputer(missing_values=np.nan, strategy=\"mean\")),\n",
    "            (\"estimator\", XGBRegressor(\n",
    "                booster='gblinear',\n",
    "                objective='count:poisson',\n",
    "                base_score=data[label].median(),\n",
    "                **kwargs\n",
    "            ))\n",
    "        ]\n",
    "    )\n",
    "    estimator.features = features\n",
    "    return estimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator = build_estimator(features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator.fit(data[estimator.features], data[label])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def coefs(estimator: Pipeline) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Tabulate the intercept and coefficients of the pipeline's final step.\n",
    "\n",
    "    Returns a frame with columns `feature` and `coef`, ordered by decreasing\n",
    "    absolute coefficient. Row labels are taken from `estimator.features`, with\n",
    "    an `intercept` row prepended.\n",
    "    \"\"\"\n",
    "    model = estimator.steps[-1][1]\n",
    "    names = ['intercept'] + estimator.features\n",
    "    values = model.intercept_.tolist() + model.coef_.tolist()\n",
    "    table = pd.DataFrame({\"feature\": names, \"coef\": values})\n",
    "    # Sort on a temporary magnitude column, then keep only the two public columns.\n",
    "    ranked = table.assign(abs_coef=table['coef'].abs()).sort_values('abs_coef', ascending=False)\n",
    "    return ranked[['feature', 'coef']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same pipeline, trained without `OverallQual`, for comparison.\n",
    "# `!=` compares string values; `is not` compares object identity and only\n",
    "# gave the intended result because of string interning.\n",
    "estimator2 = build_estimator(\n",
    "    features=[c for c in features if c != 'OverallQual']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator2.fit(data[estimator2.features], data[label])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_pred = estimator.predict(data[estimator.features])\n",
    "\n",
    "label_pred2 = estimator2.predict(data[estimator2.features])\n",
    "\n",
    "MAE_with_OverallQual = metrics.mean_absolute_error(data[label], label_pred)\n",
    "MAE_without_OverallQual = metrics.mean_absolute_error(data[label], label_pred2)\n",
    "\n",
    "display(\n",
    "    \"% Improvement MAE with OverallQual\",\n",
    "    round((MAE_without_OverallQual - MAE_with_OverallQual) / MAE_without_OverallQual * 100)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## If more than ~30% of the `OverallQual` feature values are occluded at serving time, it is better not to include the feature at all in a naive estimator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_naive_estimator(train_set: pd.DataFrame, features: List[str], label: str, **kwargs) -> BaseEstimator:\n",
    "    \"\"\"\n",
    "    Build the baseline pipeline with `build_estimator` and fit it on `train_set`.\n",
    "\n",
    "    Extra keyword arguments are passed through to `build_estimator`.\n",
    "    Returns the value of the pipeline's `fit` call.\n",
    "    \"\"\"\n",
    "    naive = build_estimator(features=features, **kwargs)\n",
    "    X_train = train_set[naive.features]\n",
    "    y_train = train_set[label]\n",
    "    return naive.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate(test_set: pd.DataFrame, estimator: BaseEstimator, label: str, do_mae: bool=True) -> float:\n",
    "    \"\"\"\n",
    "    Predict on `test_set[estimator.features]` and score against `test_set[label]`.\n",
    "\n",
    "    With `do_mae=True` (the default) the mean absolute error is returned.\n",
    "    With `do_mae=False` the predictions themselves are returned as a Series\n",
    "    indexed like `test_set`, despite the `float` return annotation.\n",
    "    \"\"\"\n",
    "    raw_predictions = estimator.predict(test_set[estimator.features])\n",
    "    preds = pd.Series(raw_predictions, index=test_set.index)\n",
    "    return metrics.mean_absolute_error(test_set[label], preds) if do_mae else preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mask_feature(x: pd.Series, p: float) -> pd.Series:\n",
    "    \"\"\"\n",
    "    Mask first p proportion of feature values with np.nan.\n",
    "    \"\"\"\n",
    "    return x.mask(x.reset_index().index <= int(p * x.size))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_with_missing(\n",
    "    feature_to_mask: str=\"OverallQual\",\n",
    "    data: pd.DataFrame=data,\n",
    "    train_func: callable = fit_naive_estimator,\n",
    "    eval_func: callable = evaluate,\n",
    "    features: List[str]=features,\n",
    "    label: str=label,\n",
    "    missing_percents: List[float] = np.arange(0,1,.1),\n",
    "    random_state: Optional[int] = None,\n",
    ") -> Dict[float, float]:\n",
    "    \"\"\"\n",
    "    Train once on shuffled `data`, then evaluate with increasing occlusion.\n",
    "\n",
    "    The rows are shuffled, `train_func` fits an estimator on the full frame, and\n",
    "    for every `p` in `missing_percents` the estimator is scored by `eval_func`\n",
    "    on a copy of the same frame in which `mask_feature` has occluded\n",
    "    `feature_to_mask` at proportion `p`.\n",
    "\n",
    "    Returns a dict mapping each `p` to the value returned by `eval_func`.\n",
    "    \"\"\"\n",
    "    data = shuffle(data, random_state=random_state)\n",
    "    # Forward the seed whenever one was supplied. The previous truthiness test\n",
    "    # (`if random_state:`) silently dropped a seed of 0.\n",
    "    train_kwargs = {} if random_state is None else {\"random_state\": random_state}\n",
    "    estimator = train_func(train_set=data, features=features, label=label, **train_kwargs)\n",
    "    return {\n",
    "        p: eval_func(\n",
    "            test_set=data.assign(**{feature_to_mask: mask_feature(data[feature_to_mask], p)}),\n",
    "            estimator=estimator,\n",
    "            label=label,\n",
    "        )\n",
    "        for p in missing_percents\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "missing_maes = [\n",
    "    evaluate_with_missing()\n",
    "    for _ in tqdm(range(NUM_SIMULATIONS), total=NUM_SIMULATIONS) \n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _format_MAE_plot(p, title) -> ggplot:\n",
    "    \"\"\"\n",
    "    Add the shared decoration used by the MAE plots to plot `p`.\n",
    "\n",
    "    Adds an orange horizontal line at the notebook-level\n",
    "    `MAE_without_OverallQual`, the black-and-white theme, the given title with\n",
    "    a fixed x-axis label, and a text box explaining the orange line.\n",
    "    \"\"\"\n",
    "    baseline = geom_hline(yintercept=MAE_without_OverallQual, color='orange')\n",
    "    axis_text = labs(\n",
    "        title=title,\n",
    "        x=\"Percent `OverallQual` occluded during serving.\",\n",
    "    )\n",
    "    # The label position is hard-coded in data coordinates.\n",
    "    baseline_note = geom_label(\n",
    "        x=10,\n",
    "        y=19000,\n",
    "        label=\"Orange line = MAE without `OverallQual` feature.\",\n",
    "        ha=\"right\",\n",
    "        va=\"top\",\n",
    "        size=8,\n",
    "        label_size=0.5,\n",
    "        fill='white',\n",
    "    )\n",
    "    return p + baseline + theme_bw() + axis_text + baseline_note"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = (\n",
    "    pd.DataFrame(missing_maes)\n",
    "    .pipe(pd.melt, var_name='percent_missing', value_name='MAE')\n",
    "    .assign(percent_missing = lambda df: df['percent_missing'].round(1).astype(str))\n",
    "    .pipe(\n",
    "        lambda df:\n",
    "        ggplot(df, aes(x='percent_missing', y='MAE'))\n",
    "        + geom_violin(draw_quantiles=[.5], scale='width', fill=cbbPalette[2])\n",
    "    )\n",
    ")\n",
    "\n",
    "_format_MAE_plot(p, title=\"Serving occlusion results in poor performance for a naive estimator\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build multiple models (solution 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set(rc={'figure.figsize':(10,1)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "T.pipe(\n",
    "    sorted(features + [label]),\n",
    "    lambda columns: data[columns].corr()[[\"OverallQual\"]].T,\n",
    "    lambda df: sns.heatmap(\n",
    "        df,\n",
    "        xticklabels=df.columns,\n",
    "        yticklabels=df.index,\n",
    "    )\n",
    ");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_estimator_soln1(\n",
    "    train_set: pd.DataFrame,\n",
    "    features: List[str],\n",
    "    label: str,\n",
    "    occluded_features: List[str]=[\"OverallQual\"],\n",
    "    **kwargs\n",
    ") -> Tuple[BaseEstimator, BaseEstimator]:\n",
    "    \"\"\"\n",
    "    Fit two pipelines: one on all `features`, one without `occluded_features`.\n",
    "\n",
    "    Extra keyword arguments are passed to `build_estimator` for both models.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    `(full_model, occluded_model)`, both fitted on `train_set`.\n",
    "    \"\"\"\n",
    "    # Keep the caller's column order. The previous `list(set(...) - set(...))`\n",
    "    # produced an arbitrary order that could change from run to run.\n",
    "    # The list default is never mutated here.\n",
    "    occluded = set(occluded_features)\n",
    "    visible_features = [f for f in features if f not in occluded]\n",
    "\n",
    "    estimator = build_estimator(features=features, **kwargs)\n",
    "    estimator_occluded = build_estimator(features=visible_features, **kwargs)\n",
    "    return (\n",
    "        estimator.fit(train_set[estimator.features], train_set[label]),\n",
    "        estimator_occluded.fit(train_set[estimator_occluded.features], train_set[label])\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_soln1(\n",
    "    test_set: pd.DataFrame,\n",
    "    estimator: Tuple[BaseEstimator, BaseEstimator],\n",
    "    label: str,\n",
    "    occluded_features: List[str]=[\"OverallQual\"],\n",
    "    do_mae: bool=True,\n",
    ") -> float:\n",
    "    \"\"\"\n",
    "    Score the two-model solution, routing each row to the matching model.\n",
    "\n",
    "    Rows where any of `occluded_features` is null are predicted by the second\n",
    "    estimator of the pair; all other rows by the first.\n",
    "\n",
    "    Returns the mean absolute error when `do_mae` is True, otherwise the\n",
    "    predictions as a Series named `f\"{label}_pred\"` aligned to `test_set.index`\n",
    "    (despite the `float` return annotation).\n",
    "    \"\"\"\n",
    "    missing_index = test_set[occluded_features].isnull().any(axis=1)\n",
    "    label_pred = f\"{label}_pred\"\n",
    "\n",
    "    # Predict only for non-empty subsets, so no dtype-less empty Series is needed.\n",
    "    parts = [\n",
    "        pd.Series(est.predict(test_set.loc[idx, est.features]), index=test_set.index[idx])\n",
    "        for est, idx in zip(estimator, [~missing_index, missing_index])\n",
    "        if idx.any()\n",
    "    ]\n",
    "    # Explicit dtype: `pd.Series()` without one is deprecated.\n",
    "    preds = (pd.concat(parts) if parts else pd.Series(dtype=float)).rename(label_pred)\n",
    "\n",
    "    # Aligning on the index restores the original row order before scoring.\n",
    "    scored = pd.concat([test_set[label], preds], axis=1)\n",
    "    if do_mae:\n",
    "        return metrics.mean_absolute_error(scored[label], scored[label_pred])\n",
    "    return scored[label_pred]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "missing_maes_soln1 = [\n",
    "    evaluate_with_missing(train_func=fit_estimator_soln1, eval_func=evaluate_soln1)\n",
    "    for _ in tqdm(range(NUM_SIMULATIONS), total=NUM_SIMULATIONS) \n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = (\n",
    "    pd.concat(\n",
    "        {\n",
    "            \"naive\": pd.DataFrame(missing_maes),\n",
    "            \"multiple_models\": pd.DataFrame(missing_maes_soln1),\n",
    "        }, \n",
    "        names=[\"Solution\"]\n",
    "    )\n",
    "    .reset_index(\"Solution\")\n",
    "    .pipe(pd.melt, var_name=\"percent_missing\", value_name=\"MAE\", id_vars=[\"Solution\"])\n",
    "    .assign(percent_missing = lambda df: df[\"percent_missing\"].astype(float).round(1).astype(str))\n",
    "    .pipe(\n",
    "        lambda df:\n",
    "        ggplot(df, aes(x=\"percent_missing\", y=\"MAE\", fill=\"Solution\"))\n",
    "        + geom_violin(draw_quantiles=[.5], scale=\"width\")\n",
    "        + scale_fill_manual(values=[cbbPalette[i] for i in [6,2]])\n",
    "    )\n",
    ");\n",
    "\n",
    "_format_MAE_plot(p, title=\"Multiple models estimator recovers performance\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data stacking (another, better solution)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _starts_with(x: List, start: List) -> bool:\n",
    "    return not set(x[:len(start)]) - set(start)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PipelineWithOcclusionStacking(Pipeline):\n",
    "    \"\"\"\n",
    "    `Pipeline` that augments the training data with an occluded copy before fitting.\n",
    "\n",
    "    `fit` appends a second copy of `X` in which every column named in\n",
    "    `occluded_features` is set to NaN (labels are duplicated to match), then\n",
    "    fits the regular pipeline on the stacked frame. Nothing else is overridden.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(\n",
    "        self, steps, memory=None, occluded_features: Optional[List[str]] = None\n",
    "    ) -> None:\n",
    "        # The default used to be the typing object `List[str]` (an annotation\n",
    "        # written as a value). The argument is stored unmodified under its own\n",
    "        # name so scikit-learn's `get_params` / `clone` can read it back.\n",
    "        self.occluded_features = occluded_features\n",
    "        super().__init__(steps=steps, memory=memory)\n",
    "\n",
    "    def fit(self, X: pd.DataFrame, y: pd.Series, **fit_params):\n",
    "        \"\"\"\n",
    "        Stack synthetic, occluded data and fit the model.\n",
    "\n",
    "        `X` must be a DataFrame whose leading columns are the occluded features.\n",
    "        Returns the result of `Pipeline.fit` on the stacked data.\n",
    "        \"\"\"\n",
    "        occluded = list(self.occluded_features or [])\n",
    "        assert isinstance(X, pd.DataFrame), \"The input data must be a `pd.DataFrame`.\"\n",
    "        assert _starts_with(X.columns, occluded), \"Occluded features must be first columns in input data.\"\n",
    "\n",
    "        X_occluded = X.assign(**{feature: np.nan for feature in occluded})\n",
    "        # `ignore_index` gives both halves one fresh, consistent row index.\n",
    "        X_stacked = pd.concat([X, X_occluded], ignore_index=True)\n",
    "        y_stacked = pd.concat([y, y], ignore_index=True)\n",
    "        return super().fit(X_stacked, y_stacked, **fit_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_stacked_estimator(features: List[str], occluded_features: List[str], **kwargs) -> BaseEstimator:\n",
    "    \"\"\"\n",
    "    Build the unfitted data-stacking pipeline.\n",
    "\n",
    "    Steps: standard scaling; a feature union of mean-imputed features and\n",
    "    missing indicators for the leading (occluded) columns; products of every\n",
    "    non-occluded feature with every indicator; a linear XGBoost regressor.\n",
    "    `**kwargs` are forwarded to `XGBRegressor`. The (possibly reordered)\n",
    "    feature list is stored on the result as `estimator.features`.\n",
    "    \"\"\"\n",
    "    # The pipeline addresses columns by position, so the occluded features must lead.\n",
    "    if not _starts_with(features, occluded_features):\n",
    "        print(\"`features` does not start with `occluded_features`, reordering...\")\n",
    "        features = occluded_features + [name for name in features if name not in occluded_features]\n",
    "\n",
    "    n_features = len(features)\n",
    "    n_occluded = len(occluded_features)\n",
    "    # Positions of the always-available features in the union output.\n",
    "    base_columns_index = [pos for pos, name in enumerate(features) if name not in occluded_features]\n",
    "    # Indicator columns are appended right after the imputed feature columns.\n",
    "    interaction_columns_index = list(range(n_features, n_features + n_occluded))\n",
    "\n",
    "    def _interact_with_occlusion_indicators(X: np.ndarray) -> np.ndarray:\n",
    "        \"\"\"Append one base-feature x indicator product column per (base, indicator) pair.\"\"\"\n",
    "        frame = pd.DataFrame(X)\n",
    "        products = {}\n",
    "        for base_idx in base_columns_index:\n",
    "            for interaction_idx in interaction_columns_index:\n",
    "                products[f\"{base_idx}_{interaction_idx}\"] = (\n",
    "                    frame.iloc[:, base_idx] * frame.iloc[:, interaction_idx]\n",
    "                )\n",
    "        return frame.assign(**products).values\n",
    "\n",
    "    def _select_occluded_features(X: np.ndarray) -> np.ndarray:\n",
    "        \"\"\"Keep only the leading `len(occluded_features)` columns of `X`.\"\"\"\n",
    "        return X[:, :n_occluded]\n",
    "\n",
    "    indicator_branch = Pipeline(\n",
    "        [\n",
    "            ('selector', preprocessing.FunctionTransformer(_select_occluded_features, validate=False)),\n",
    "            ('indicator', impute.MissingIndicator())\n",
    "        ]\n",
    "    )\n",
    "    imputer_union = FeatureUnion(\n",
    "        transformer_list=[\n",
    "            ('features', impute.SimpleImputer(missing_values=np.nan, strategy='mean')),\n",
    "            ('indicators', indicator_branch)\n",
    "        ]\n",
    "    )\n",
    "    regressor = XGBRegressor(\n",
    "        booster='gblinear',\n",
    "        objective='count:poisson',\n",
    "        base_score=data[label].median(),\n",
    "        **kwargs\n",
    "    )\n",
    "\n",
    "    estimator = PipelineWithOcclusionStacking(\n",
    "        [\n",
    "            (\"scaler\", preprocessing.StandardScaler()),\n",
    "            (\"imputer\", imputer_union),\n",
    "            (\"interactor\", preprocessing.FunctionTransformer(_interact_with_occlusion_indicators, validate=False)),\n",
    "            (\"estimator\", regressor)\n",
    "        ],\n",
    "        occluded_features = occluded_features\n",
    "    )\n",
    "    estimator.features = features\n",
    "    return estimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_estimator_soln2(\n",
    "    train_set: pd.DataFrame,\n",
    "    features: List[str],\n",
    "    label: str,\n",
    "    occluded_features: List[str]=[\"OverallQual\"],\n",
    "    **kwargs\n",
    ") -> BaseEstimator:\n",
    "    \"\"\"\n",
    "    Build the data-stacking pipeline with `build_stacked_estimator` and fit it.\n",
    "\n",
    "    Columns are selected via the built estimator's own `features` attribute,\n",
    "    not the `features` argument. Extra keyword arguments go to\n",
    "    `build_stacked_estimator`. Returns the value of the pipeline's `fit` call.\n",
    "    \"\"\"\n",
    "    stacked = build_stacked_estimator(\n",
    "        features=features, occluded_features=occluded_features, **kwargs\n",
    "    )\n",
    "    X_train = train_set[stacked.features]\n",
    "    y_train = train_set[label]\n",
    "    return stacked.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "missing_maes_soln2 = [\n",
    "    evaluate_with_missing(train_func=fit_estimator_soln2)\n",
    "    for _ in tqdm(range(NUM_SIMULATIONS), total=NUM_SIMULATIONS) \n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = (\n",
    "    pd.concat(\n",
    "        {\n",
    "            \"naive\": pd.DataFrame(missing_maes),\n",
    "            \"multiple_models\": pd.DataFrame(missing_maes_soln1),\n",
    "            \"data_stacking\": pd.DataFrame(missing_maes_soln2)\n",
    "        }, \n",
    "        names=[\"Solution\"]\n",
    "    )\n",
    "    .reset_index(\"Solution\")\n",
    "    .pipe(pd.melt, var_name=\"percent_missing\", value_name=\"MAE\", id_vars=[\"Solution\"])\n",
    "    .assign(percent_missing = lambda df: df[\"percent_missing\"].astype(float).round(1).astype(str))\n",
    "    .pipe(\n",
    "        lambda df:\n",
    "        ggplot(df, aes(x=\"percent_missing\", y=\"MAE\", fill=\"Solution\"))\n",
    "        + geom_violin(draw_quantiles=[.5], scale=\"width\")\n",
    "        + scale_fill_manual(values=[cbbPalette[i] for i in [3,6,2]])\n",
    "    )\n",
    ");\n",
    "\n",
    "_format_MAE_plot(p, title=\"Data stacking performance is identical to multiple models\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Conclusion"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Appendix"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Other simulations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compare preds of two solutions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "occluded_features = [\"OverallQual\"]\n",
    "# Occluded features first, then every remaining feature in its original order.\n",
    "stacked_features = occluded_features + [name for name in features if name not in occluded_features]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "compare_preds = [\n",
    "    T.pipe(\n",
    "        np.random.randint(np.iinfo(np.int32).max),\n",
    "        lambda random_state:\n",
    "        (\n",
    "            pd.DataFrame(\n",
    "                evaluate_with_missing(\n",
    "                    train_func=fit_estimator_soln1,\n",
    "                    eval_func=functools.partial(evaluate_soln1, do_mae=False),\n",
    "                    random_state=random_state,\n",
    "                    features=stacked_features,\n",
    "                )\n",
    "            ) -\n",
    "            pd.DataFrame(\n",
    "                evaluate_with_missing(\n",
    "                    train_func=fit_estimator_soln2, \n",
    "                    eval_func=functools.partial(evaluate, do_mae=False),\n",
    "                    random_state=random_state,\n",
    "                    features=stacked_features,\n",
    "                )\n",
    "            )\n",
    "        )\n",
    "    )\n",
    "        for _ in tqdm(range(NUM_SIMULATIONS), total=NUM_SIMULATIONS)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    pd.concat(compare_preds)\n",
    "    .pipe(pd.melt, var_name=\"percent_missing\", value_name=\"solution_1_2_pred_difference\") \n",
    "    .pipe(\n",
    "        lambda df:\n",
    "        ggplot(df, aes(x=\"solution_1_2_pred_difference\", group=\"percent_missing\", color=\"percent_missing\")) \n",
    "        + stat_ecdf()\n",
    "        + xlim(-5000, 5000)\n",
    "        + theme_bw()\n",
    "        + scale_y_continuous(breaks=[0,.1,.25,.5,.75,.9,1])\n",
    "        + labs(\n",
    "            color=\"% `OverallQual` \\n occluded during serving.\",\n",
    "            y=\"Percentile\",\n",
    "            x=\"Multiple models - Data stacking prediction difference\"\n",
    "        )\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compare coefficients of two solutions\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compare_coefficients(\n",
    "    data: pd.DataFrame, estimator_funcs: Tuple[callable, callable], label: str=label, resample: bool=True\n",
    ") -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Fit two estimators on the same data and compare their rescaled coefficients.\n",
    "\n",
    "    Each callable in `estimator_funcs` is invoked with a `random_state` keyword\n",
    "    and must return an unfitted pipeline that has a `features` attribute and a\n",
    "    `scaler` step. With `resample=True` the rows are bootstrapped and a random\n",
    "    seed is drawn; otherwise the data is used as given and the seed is None.\n",
    "\n",
    "    Returns (second - first) coefficient differences divided by the mean of the\n",
    "    two, with NaN entries dropped. The result is a Series, despite the\n",
    "    `pd.DataFrame` return annotation.\n",
    "    \"\"\"\n",
    "    random_state = None\n",
    "    if resample:\n",
    "        # Resample first, then draw the seed: both use NumPy's global random state.\n",
    "        data = data.sample(frac=1.0, replace=True)\n",
    "        random_state = np.random.randint(np.iinfo(np.int32).max)\n",
    "\n",
    "    def _fitted_coefs(estimator_func: callable) -> pd.Series:\n",
    "        \"\"\"Fit one estimator; return its coefficients divided by the scaler's standard deviations.\"\"\"\n",
    "        unfitted = estimator_func(random_state=random_state)\n",
    "        fitted = unfitted.fit(data[unfitted.features], data[label])\n",
    "        paired = pd.concat(\n",
    "            [\n",
    "                pd.Series(fitted.steps[-1][1].coef_),\n",
    "                pd.Series(fitted.named_steps['scaler'].var_),\n",
    "            ],\n",
    "            axis=1,\n",
    "        )\n",
    "        return paired[0] / np.sqrt(paired[1])\n",
    "\n",
    "    # One row per estimator, one column per coefficient position.\n",
    "    per_estimator = pd.concat([_fitted_coefs(fun) for fun in estimator_funcs], axis=1).T\n",
    "    relative_difference = per_estimator.diff().loc[1, :] / per_estimator.mean(axis=0)\n",
    "    return relative_difference.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "coefficient_comparisons = [\n",
    "    compare_coefficients(\n",
    "        data=data, \n",
    "        estimator_funcs = [\n",
    "            lambda random_state: build_stacked_estimator(stacked_features, occluded_features, random_state=random_state),\n",
    "            lambda random_state: build_estimator(stacked_features, random_state=random_state)\n",
    "        ]\n",
    "    )\n",
    "    for _ in tqdm(range(NUM_SIMULATIONS))\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    pd.concat(coefficient_comparisons, axis=1).T\n",
    "    .rename(columns={i: f for i,f in enumerate(stacked_features)})\n",
    "    .pipe(pd.melt, var_name=\"feature\", value_name=\"normalized_coefficient_difference\") \n",
    "    .pipe(\n",
    "        lambda df:\n",
    "        ggplot(df, aes(x=\"normalized_coefficient_difference\")) \n",
    "        + stat_ecdf()\n",
    "        + scale_x_continuous(breaks=[-1, -0.5, -0.2, 0, 0.2, 0.5, 1], limits=[-1,1])\n",
    "        + scale_y_continuous(breaks=[0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0])\n",
    "        + theme_bw()\n",
    "        + labs(\n",
    "            x = \"Soln 1 - Soln 2 coefficient difference, scaled by average coefficient value\",\n",
    "            y = \"Percentile\"\n",
    "        )\n",
    "    )\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}