Skip to content

Instantly share code, notes, and snippets.

@lpekelis
Last active April 12, 2020 00:46
Show Gist options
  • Save lpekelis/690ce481f98ed5c2e793c40b6755f4ea to your computer and use it in GitHub Desktop.
Save lpekelis/690ce481f98ed5c2e793c40b6755f4ea to your computer and use it in GitHub Desktop.
Code for reproducing results of this Opendoor technical blog post, https://www.opendoor.com/w/blog/taming-missing-features-at-serving-time. We're hiring!
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import functools\n",
"import toolz as T\n",
"import itertools\n",
"from typing import Dict, List, Optional, Tuple\n",
"from tqdm import tqdm\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from plotnine import *\n",
"import seaborn as sns\n",
"from sklearn import impute, preprocessing, metrics\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.pipeline import FeatureUnion, Pipeline\n",
"from sklearn.utils import shuffle\n",
"from xgboost import XGBRegressor\n",
"\n",
"# color blind friendly palette\n",
"cbbPalette = [\"#000000\", \"#E69F00\", \"#56B4E9\", \"#009E73\", \"#F0E442\", \"#0072B2\", \"#D55E00\", \"#CC79A7\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"NUM_SIMULATIONS = 100"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Introduction"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = pd.read_csv('housetrain.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"numeric_columns = data.select_dtypes(['number']).columns\n",
"data[numeric_columns] = data[numeric_columns].astype(float)\n",
"data['SalePrice'] = data['SalePrice'].astype(int)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Serving occlusion (the problem)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Numeric Columns minus identifiers like Id or MSSubClass and clearly leaky features for predictinng SalePrice\n",
"# like MoSold and YrSold\n",
"features = [\n",
" '1stFlrSF',\n",
" '2ndFlrSF',\n",
" '3SsnPorch',\n",
" 'BedroomAbvGr',\n",
" 'BsmtFinSF1',\n",
" 'BsmtFinSF2',\n",
" 'BsmtFullBath',\n",
" 'BsmtHalfBath',\n",
" 'BsmtUnfSF',\n",
" 'EnclosedPorch',\n",
" 'Fireplaces',\n",
" 'FullBath',\n",
" 'GarageArea',\n",
" 'GarageCars',\n",
" 'GarageYrBlt',\n",
" 'GrLivArea',\n",
" 'HalfBath',\n",
" 'KitchenAbvGr',\n",
" 'LotArea',\n",
" 'LotFrontage',\n",
" 'LowQualFinSF',\n",
" 'MasVnrArea',\n",
" 'OpenPorchSF',\n",
" 'OverallCond',\n",
" 'OverallQual',\n",
" 'PoolArea',\n",
" 'ScreenPorch',\n",
" 'TotRmsAbvGrd',\n",
" 'TotalBsmtSF',\n",
" 'WoodDeckSF',\n",
" 'YearBuilt',\n",
" 'YearRemodAdd'\n",
"]\n",
"\n",
"label = 'SalePrice'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def build_estimator(features: List[str], **kwargs) -> BaseEstimator:\n",
"    \"\"\"Build the unfitted naive pipeline: standard-scale, mean-impute, then a linear XGBoost regressor.\n",
"\n",
"    Args:\n",
"        features: Names of the input columns. Stored on the returned pipeline as `.features`\n",
"            so callers can select the matching columns with `frame[estimator.features]`.\n",
"        **kwargs: Extra keyword arguments forwarded to `XGBRegressor`.\n",
"\n",
"    Returns:\n",
"        The unfitted `Pipeline` with a `features` attribute attached.\n",
"\n",
"    Note:\n",
"        `base_score` is the median of the notebook-level `data[label]`, so both globals\n",
"        must be defined before this function is called.\n",
"    \"\"\"\n",
"    regressor = XGBRegressor(\n",
"        booster='gblinear',\n",
"        objective='count:poisson',\n",
"        base_score=data[label].median(),\n",
"        **kwargs\n",
"    )\n",
"    estimator = Pipeline(\n",
"        [\n",
"            (\"scaler\", preprocessing.StandardScaler()),\n",
"            (\"imputer\", impute.SimpleImputer(missing_values=np.nan, strategy=\"mean\")),\n",
"            (\"estimator\", regressor),\n",
"        ]\n",
"    )\n",
"    # Remember which columns this pipeline was built for.\n",
"    estimator.features = features\n",
"    return estimator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"estimator = build_estimator(features)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"estimator.fit(data[estimator.features], data[label])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def coefs(estimator: Pipeline) -> pd.DataFrame:\n",
" return (\n",
" pd.DataFrame({\n",
" \"feature\": ['intercept'] + estimator.features, \n",
" \"coef\": (\n",
" estimator.steps[-1][1].intercept_.tolist() + \n",
" estimator.steps[-1][1].coef_.tolist()\n",
" )\n",
" })[['feature', 'coef']]\n",
" .assign(abs_coef = lambda df: df['coef'].abs())\n",
" .sort_values('abs_coef', ascending=False)\n",
" .drop('abs_coef', axis=1)\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Same naive pipeline, built without the `OverallQual` column.\n",
"# Compare strings by value (`!=`): `is not` tests object identity and only works by accident of interning.\n",
"estimator2 = build_estimator(\n",
"    features=[c for c in features if c != 'OverallQual']\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"estimator2.fit(data[estimator2.features], data[label])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"label_pred = estimator.predict(data[estimator.features])\n",
"\n",
"label_pred2 = estimator2.predict(data[estimator2.features])\n",
"\n",
"MAE_with_OverallQual = metrics.mean_absolute_error(data[label], label_pred)\n",
"MAE_without_OverallQual = metrics.mean_absolute_error(data[label], label_pred2)\n",
"\n",
"display(\n",
" \"% Improvement MAE with OverallQual\",\n",
" round((MAE_without_OverallQual - MAE_with_OverallQual) / MAE_without_OverallQual * 100)\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## If more than ~30% of the `OverallQual` feature is occluded at serving time, it's better not to include it at all in a naive estimator."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def fit_naive_estimator(train_set: pd.DataFrame, features: List[str], label: str, **kwargs) -> BaseEstimator:\n",
"    \"\"\"Build the naive pipeline for `features` and fit it on `train_set`.\n",
"\n",
"    Extra keyword arguments are forwarded to `build_estimator`. Returns whatever the\n",
"    pipeline's `fit` returns.\n",
"    \"\"\"\n",
"    naive = build_estimator(features=features, **kwargs)\n",
"    X_train = train_set[naive.features]\n",
"    y_train = train_set[label]\n",
"    return naive.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def evaluate(test_set: pd.DataFrame, estimator: BaseEstimator, label: str, do_mae: bool=True) -> float:\n",
" preds = pd.Series(estimator.predict(test_set[estimator.features]), index=test_set.index)\n",
" if do_mae:\n",
" return metrics.mean_absolute_error(test_set[label], preds)\n",
" return preds"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def mask_feature(x: pd.Series, p: float) -> pd.Series:\n",
" \"\"\"\n",
" Mask first p proportion of feature values with np.nan.\n",
" \"\"\"\n",
" return x.mask(x.reset_index().index <= int(p * x.size))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_with_missing(\n",
"    feature_to_mask: str=\"OverallQual\",\n",
"    data: pd.DataFrame=data,\n",
"    train_func: callable = fit_naive_estimator,\n",
"    eval_func: callable = evaluate,\n",
"    features: List[str]=features,\n",
"    label: str=label,\n",
"    missing_percents: List[float] = np.arange(0,1,.1),\n",
"    random_state: Optional[int] = None,\n",
") -> Dict[float, float]:\n",
"    \"\"\"Train once on shuffled `data`, then score it with growing fractions of one feature masked.\n",
"\n",
"    Args:\n",
"        feature_to_mask: Column that is set to NaN in the evaluation copies.\n",
"        data: Frame used both for training and, after masking, for evaluation.\n",
"        train_func: Called as `train_func(train_set=..., features=..., label=...)`, plus\n",
"            `random_state=...` when a seed is given; returns the fitted estimator(s).\n",
"        eval_func: Called as `eval_func(test_set=..., estimator=..., label=...)`.\n",
"        features: Input column names passed to `train_func`.\n",
"        label: Target column name.\n",
"        missing_percents: Fractions of `feature_to_mask` to mask, one evaluation per value.\n",
"        random_state: Optional seed for the shuffle and for `train_func`.\n",
"\n",
"    Returns:\n",
"        A dict mapping each value of `missing_percents` to the result of `eval_func`.\n",
"    \"\"\"\n",
"    data = shuffle(data, random_state=random_state)\n",
"\n",
"    # Forward the seed only when one was supplied. Test against None rather than truthiness\n",
"    # so that a seed of 0 is still passed through.\n",
"    train_kwargs = {} if random_state is None else {\"random_state\": random_state}\n",
"    estimator = train_func(train_set=data, features=features, label=label, **train_kwargs)\n",
"\n",
"    def _occluded(p: float) -> pd.DataFrame:\n",
"        \"\"\"Copy of the shuffled data with the first `p` share of `feature_to_mask` set to NaN.\"\"\"\n",
"        return data.assign(**{feature_to_mask: mask_feature(data[feature_to_mask], p)})\n",
"\n",
"    return {\n",
"        p: eval_func(test_set=_occluded(p), estimator=estimator, label=label)\n",
"        for p in missing_percents\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"missing_maes = [\n",
" evaluate_with_missing()\n",
" for _ in tqdm(range(NUM_SIMULATIONS), total=NUM_SIMULATIONS) \n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def _format_MAE_plot(p, title) -> ggplot:\n",
"    \"\"\"Apply the shared MAE-plot decoration to the plot `p`.\n",
"\n",
"    Adds, in this order: an orange horizontal line at the notebook-level\n",
"    `MAE_without_OverallQual`, the black-and-white theme, the title and x-axis label,\n",
"    and a text box explaining the orange line.\n",
"    \"\"\"\n",
"    baseline = geom_hline(\n",
"        yintercept=MAE_without_OverallQual,\n",
"        color='orange'\n",
"    )\n",
"    axis_text = labs(\n",
"        title=title,\n",
"        x=\"Percent `OverallQual` occluded during serving.\",\n",
"    )\n",
"    baseline_note = geom_label(\n",
"        x=10,\n",
"        y=19000,\n",
"        label=\"Orange line = MAE without `OverallQual` feature.\",\n",
"        ha=\"right\",\n",
"        va=\"top\",\n",
"        size=8,\n",
"        label_size=0.5,\n",
"        fill='white',\n",
"    )\n",
"    return p + baseline + theme_bw() + axis_text + baseline_note"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p = (\n",
" pd.DataFrame(missing_maes)\n",
" .pipe(pd.melt, var_name='percent_missing', value_name='MAE')\n",
" .assign(percent_missing = lambda df: df['percent_missing'].round(1).astype(str))\n",
" .pipe(\n",
" lambda df:\n",
" ggplot(df, aes(x='percent_missing', y='MAE'))\n",
" + geom_violin(draw_quantiles=[.5], scale='width', fill=cbbPalette[2])\n",
" )\n",
")\n",
"\n",
"_format_MAE_plot(p, title=\"Serving occlusion results in poor performance for a naive estimator\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Build multiple models (solution 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sns.set(rc={'figure.figsize':(10,1)})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"T.pipe(\n",
" sorted(features + [label]),\n",
" lambda columns: data[columns].corr()[[\"OverallQual\"]].T,\n",
" lambda df: sns.heatmap(\n",
" df,\n",
" xticklabels=df.columns,\n",
" yticklabels=df.index,\n",
" )\n",
");"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def fit_estimator_soln1(\n",
"    train_set: pd.DataFrame,\n",
"    features: List[str],\n",
"    label: str,\n",
"    occluded_features: List[str]=[\"OverallQual\"],\n",
"    **kwargs\n",
") -> Tuple[BaseEstimator, BaseEstimator]:\n",
"    \"\"\"Fit the two models of the \"multiple models\" solution on `train_set`.\n",
"\n",
"    Args:\n",
"        train_set: Training frame containing `features` and `label`.\n",
"        features: Input columns for the full model.\n",
"        label: Target column name.\n",
"        occluded_features: Columns left out of the second model. Never mutated here.\n",
"        **kwargs: Forwarded to `build_estimator` for both models.\n",
"\n",
"    Returns:\n",
"        `(full_model, occluded_model)`: the first is fitted on all `features`, the second\n",
"        on `features` without `occluded_features`.\n",
"    \"\"\"\n",
"    estimator = build_estimator(features=features, **kwargs)\n",
"    # Filter with a comprehension rather than a set difference so the remaining columns keep\n",
"    # their original order; set iteration order is not stable across interpreter runs.\n",
"    visible_features = [f for f in features if f not in occluded_features]\n",
"    estimator_occluded = build_estimator(features=visible_features, **kwargs)\n",
"    return (\n",
"        estimator.fit(train_set[estimator.features], train_set[label]),\n",
"        estimator_occluded.fit(train_set[estimator_occluded.features], train_set[label])\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def evaluate_soln1(\n",
" test_set: pd.DataFrame,\n",
" estimator: Tuple[BaseEstimator, BaseEstimator],\n",
" label: str,\n",
" occluded_features: List[str]=[\"OverallQual\"],\n",
" do_mae: bool=True,\n",
") -> float:\n",
" missing_index = test_set[occluded_features].isnull().any(axis=1)\n",
" label_pred = f\"{label}_pred\"\n",
" preds = pd.concat([\n",
" pd.Series(est.predict(test_set.loc[idx][est.features]), index=test_set.index[idx])\n",
" if idx.sum() else pd.Series()\n",
" for est, idx in zip(estimator, [~missing_index, missing_index])\n",
" ]).rename(label_pred)\n",
" \n",
" return T.pipe(\n",
" pd.concat([test_set[label], preds], axis=1),\n",
" lambda df: metrics.mean_absolute_error(df[label], df[label_pred]) if do_mae else df[label_pred],\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"missing_maes_soln1 = [\n",
" evaluate_with_missing(train_func=fit_estimator_soln1, eval_func=evaluate_soln1)\n",
" for _ in tqdm(range(NUM_SIMULATIONS), total=NUM_SIMULATIONS) \n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p = (\n",
" pd.concat(\n",
" {\n",
" \"naive\": pd.DataFrame(missing_maes),\n",
" \"multiple_models\": pd.DataFrame(missing_maes_soln1),\n",
" }, \n",
" names=[\"Solution\"]\n",
" )\n",
" .reset_index(\"Solution\")\n",
" .pipe(pd.melt, var_name=\"percent_missing\", value_name=\"MAE\", id_vars=[\"Solution\"])\n",
" .assign(percent_missing = lambda df: df[\"percent_missing\"].astype(float).round(1).astype(str))\n",
" .pipe(\n",
" lambda df:\n",
" ggplot(df, aes(x=\"percent_missing\", y=\"MAE\", fill=\"Solution\"))\n",
" + geom_violin(draw_quantiles=[.5], scale=\"width\")\n",
" + scale_fill_manual(values=[cbbPalette[i] for i in [6,2]])\n",
" )\n",
");\n",
"\n",
"_format_MAE_plot(p, title=\"Multiple models estimator recovers performance\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data stacking (another, better solution)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def _starts_with(x: List, start: List) -> bool:\n",
" return not set(x[:len(start)]) - set(start)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class PipelineWithOcclusionStacking(Pipeline):\n",
"    \"\"\"Pipeline that trains on the data stacked with a copy whose occluded features are all NaN.\n",
"\n",
"    Only `fit` is overridden; every other method is inherited from `Pipeline`.\n",
"    \"\"\"\n",
"\n",
"    def __init__(\n",
"        self, steps, memory=None, occluded_features: Optional[List[str]] = None\n",
"    ) -> None:\n",
"        # Stored under the same name as the constructor argument, which scikit-learn's\n",
"        # get_params()/clone() machinery relies on.\n",
"        self.occluded_features = occluded_features\n",
"        super().__init__(steps=steps, memory=memory)\n",
"\n",
"    @property\n",
"    def _occluded_features(self) -> Optional[List[str]]:\n",
"        \"\"\"Read-only alias kept for code that used the former private attribute name.\"\"\"\n",
"        return self.occluded_features\n",
"\n",
"    def fit(self, X: pd.DataFrame, y: pd.Series, **fit_params):\n",
"        \"\"\"Stack synthetic, occluded data and fit the model.\n",
"\n",
"        Args:\n",
"            X: Training frame whose leading columns are the occluded features.\n",
"            y: Training target; it is duplicated to match the stacked rows.\n",
"            **fit_params: Forwarded to `Pipeline.fit`.\n",
"\n",
"        Returns:\n",
"            Whatever `Pipeline.fit` returns.\n",
"\n",
"        Raises:\n",
"            ValueError: If `occluded_features` was not supplied to the constructor.\n",
"            AssertionError: If `X` is not a DataFrame or does not start with the occluded columns.\n",
"        \"\"\"\n",
"        if self.occluded_features is None:\n",
"            raise ValueError(\"`occluded_features` must be provided.\")\n",
"        assert isinstance(X, pd.DataFrame), \"The input data must be a `pd.DataFrame`.\"\n",
"        assert _starts_with(X.columns, self.occluded_features), \"Occluded features must be first columns in input data.\"\n",
"\n",
"        # Second half of the training set: the same rows with the occluded columns blanked out.\n",
"        X_occluded = X.assign(**{feature: np.nan for feature in self.occluded_features})\n",
"        X_stacked = pd.concat([X, X_occluded], ignore_index=True)\n",
"        y_stacked = pd.concat([y, y], ignore_index=True)\n",
"        return super().fit(X_stacked, y_stacked, **fit_params)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def build_stacked_estimator(features: List[str], occluded_features: List[str], **kwargs) -> BaseEstimator:\n",
"    \"\"\"Build the unfitted data-stacking pipeline.\n",
"\n",
"    Steps: standard-scale; mean-impute all features and append missing indicators for the\n",
"    occluded ones; append the product of every non-occluded feature with every indicator;\n",
"    fit a linear XGBoost regressor. `**kwargs` are forwarded to `XGBRegressor`.\n",
"\n",
"    The returned pipeline carries the (possibly reordered) column list as `.features`.\n",
"    \"\"\"\n",
"    # The pipeline finds the occluded columns by position, so they must lead the feature list.\n",
"    if not _starts_with(features, occluded_features):\n",
"        print(\"`features` does not start with `occluded_features`, reordering...\")\n",
"        features = occluded_features + [f for f in features if f not in occluded_features]\n",
"\n",
"    n_occluded = len(occluded_features)\n",
"    # Positions of the non-occluded features among the imputed columns ...\n",
"    base_columns_index = [i for i, f in enumerate(features) if f not in occluded_features]\n",
"    # ... and of the occlusion indicators, expected right after the len(features) imputed columns.\n",
"    interaction_columns_index = [len(features) + i for i in range(n_occluded)]\n",
"\n",
"    def _interact_with_occlusion_indicators(X: np.ndarray) -> np.ndarray:\n",
"        \"\"\"Append one `base * indicator` column per (base feature, indicator) pair, base-major.\"\"\"\n",
"        X = np.asarray(X)\n",
"        products = [\n",
"            X[:, base_idx] * X[:, interaction_idx]\n",
"            for base_idx, interaction_idx in itertools.product(\n",
"                base_columns_index, interaction_columns_index\n",
"            )\n",
"        ]\n",
"        return np.column_stack([X] + products)\n",
"\n",
"    def _select_occluded_features(X: np.ndarray) -> np.ndarray:\n",
"        \"\"\"\n",
"        Keep only the leading columns, which must be the occluded features.\n",
"        \"\"\"\n",
"        return X[:, :n_occluded]\n",
"\n",
"    indicator_branch = Pipeline(\n",
"        [\n",
"            ('selector', preprocessing.FunctionTransformer(_select_occluded_features, validate=False)),\n",
"            ('indicator', impute.MissingIndicator())\n",
"        ]\n",
"    )\n",
"    imputer_with_indicators = FeatureUnion(\n",
"        transformer_list=[\n",
"            ('features', impute.SimpleImputer(missing_values=np.nan, strategy='mean')),\n",
"            ('indicators', indicator_branch)\n",
"        ]\n",
"    )\n",
"    regressor = XGBRegressor(\n",
"        booster='gblinear',\n",
"        objective='count:poisson',\n",
"        base_score=data[label].median(),\n",
"        **kwargs\n",
"    )\n",
"\n",
"    estimator = PipelineWithOcclusionStacking(\n",
"        [\n",
"            (\"scaler\", preprocessing.StandardScaler()),\n",
"            (\"imputer\", imputer_with_indicators),\n",
"            (\"interactor\", preprocessing.FunctionTransformer(_interact_with_occlusion_indicators, validate=False)),\n",
"            (\"estimator\", regressor)\n",
"        ],\n",
"        occluded_features = occluded_features\n",
"    )\n",
"    estimator.features = features\n",
"    return estimator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def fit_estimator_soln2(\n",
"    train_set: pd.DataFrame,\n",
"    features: List[str],\n",
"    label: str,\n",
"    occluded_features: List[str]=[\"OverallQual\"],\n",
"    **kwargs\n",
") -> BaseEstimator:\n",
"    \"\"\"Build the data-stacking pipeline and fit it on `train_set`.\n",
"\n",
"    `**kwargs` are forwarded to `build_stacked_estimator`. Returns whatever the\n",
"    pipeline's `fit` returns.\n",
"    \"\"\"\n",
"    stacked = build_stacked_estimator(features=features, occluded_features=occluded_features, **kwargs)\n",
"    # Select columns in the pipeline's own order: the builder may have reordered `features`.\n",
"    X_train = train_set[stacked.features]\n",
"    y_train = train_set[label]\n",
"    return stacked.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"missing_maes_soln2 = [\n",
" evaluate_with_missing(train_func=fit_estimator_soln2)\n",
" for _ in tqdm(range(NUM_SIMULATIONS), total=NUM_SIMULATIONS) \n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"p = (\n",
" pd.concat(\n",
" {\n",
" \"naive\": pd.DataFrame(missing_maes),\n",
" \"multiple_models\": pd.DataFrame(missing_maes_soln1),\n",
" \"data_stacking\": pd.DataFrame(missing_maes_soln2)\n",
" }, \n",
" names=[\"Solution\"]\n",
" )\n",
" .reset_index(\"Solution\")\n",
" .pipe(pd.melt, var_name=\"percent_missing\", value_name=\"MAE\", id_vars=[\"Solution\"])\n",
" .assign(percent_missing = lambda df: df[\"percent_missing\"].astype(float).round(1).astype(str))\n",
" .pipe(\n",
" lambda df:\n",
" ggplot(df, aes(x=\"percent_missing\", y=\"MAE\", fill=\"Solution\"))\n",
" + geom_violin(draw_quantiles=[.5], scale=\"width\")\n",
" + scale_fill_manual(values=[cbbPalette[i] for i in [3,6,2]])\n",
" )\n",
");\n",
"\n",
"_format_MAE_plot(p, title=\"Data stacking performance is identical to multiple models\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Conclusion"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Appendix"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Other simulations"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Compare preds of two solutions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Put the occluded column(s) first, followed by every other feature in its original order.\n",
"occluded_features = [\"OverallQual\"]\n",
"stacked_features = occluded_features + [\n",
"    f for f in features if f not in occluded_features\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"compare_preds = [\n",
" T.pipe(\n",
" np.random.randint(np.iinfo(np.int32).max),\n",
" lambda random_state:\n",
" (\n",
" pd.DataFrame(\n",
" evaluate_with_missing(\n",
" train_func=fit_estimator_soln1,\n",
" eval_func=functools.partial(evaluate_soln1, do_mae=False),\n",
" random_state=random_state,\n",
" features=stacked_features,\n",
" )\n",
" ) -\n",
" pd.DataFrame(\n",
" evaluate_with_missing(\n",
" train_func=fit_estimator_soln2, \n",
" eval_func=functools.partial(evaluate, do_mae=False),\n",
" random_state=random_state,\n",
" features=stacked_features,\n",
" )\n",
" )\n",
" )\n",
" )\n",
" for _ in tqdm(range(NUM_SIMULATIONS), total=NUM_SIMULATIONS)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"(\n",
" pd.concat(compare_preds)\n",
" .pipe(pd.melt, var_name=\"percent_missing\", value_name=\"solution_1_2_pred_difference\") \n",
" .pipe(\n",
" lambda df:\n",
" ggplot(df, aes(x=\"solution_1_2_pred_difference\", group=\"percent_missing\", color=\"percent_missing\")) \n",
" + stat_ecdf()\n",
" + xlim(-5000, 5000)\n",
" + theme_bw()\n",
" + scale_y_continuous(breaks=[0,.1,.25,.5,.75,.9,1])\n",
" + labs(\n",
" color=\"% `OverallQual` \\n occluded during serving.\",\n",
" y=\"Percentile\",\n",
" x=\"Multiple models - Data stacking prediction difference\"\n",
" )\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Compare coefficients of two solutions\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def compare_coefficients(\n",
"    data: pd.DataFrame, estimator_funcs: Tuple[callable, callable], label: str=label, resample: bool=True\n",
") -> pd.DataFrame:\n",
"    \"\"\"Fit two estimators on the same data and return their relative coefficient difference.\n",
"\n",
"    Args:\n",
"        data: Frame containing the feature columns and `label`.\n",
"        estimator_funcs: Two callables, each invoked as `func(random_state=...)` and returning\n",
"            an unfitted pipeline that has a `features` attribute and a `scaler` step.\n",
"        label: Target column name.\n",
"        resample: When True, fit on a bootstrap resample of `data` and draw a random seed;\n",
"            when False, use `data` as is and pass `random_state=None`.\n",
"\n",
"    Returns:\n",
"        Per coefficient position, (second estimator - first estimator) divided by the mean of\n",
"        the two, computed on coefficients divided by the scaler's standard deviation.\n",
"        Positions where the result is NaN are dropped.\n",
"    \"\"\"\n",
"    random_state = None\n",
"    if resample:\n",
"        data = data.sample(frac=1.0, replace=True)\n",
"        random_state = np.random.randint(np.iinfo(np.int32).max)\n",
"\n",
"    def _fitted_coefs(estimator_func: callable) -> pd.Series:\n",
"        \"\"\"Fit one estimator and return its coefficients divided by sqrt of the scaler variances.\"\"\"\n",
"        estimator = estimator_func(random_state=random_state)\n",
"        fitted = estimator.fit(data[estimator.features], data[label])\n",
"        coefficients = pd.Series(fitted.steps[-1][1].coef_)\n",
"        variances = pd.Series(fitted.named_steps['scaler'].var_)\n",
"        # Side-by-side concat aligns the two by position; unmatched positions become NaN.\n",
"        paired = pd.concat([coefficients, variances], axis=1)\n",
"        return paired[0] / np.sqrt(paired[1])\n",
"\n",
"    per_estimator = pd.concat([_fitted_coefs(fun) for fun in estimator_funcs], axis=1).T\n",
"    relative_difference = per_estimator.diff().loc[1, :] / per_estimator.mean(axis=0)\n",
"    return relative_difference.dropna()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"coefficient_comparisons = [\n",
" compare_coefficients(\n",
" data=data, \n",
" estimator_funcs = [\n",
" lambda random_state: build_stacked_estimator(stacked_features, occluded_features, random_state=random_state),\n",
" lambda random_state: build_estimator(stacked_features, random_state=random_state)\n",
" ]\n",
" )\n",
" for _ in tqdm(range(NUM_SIMULATIONS))\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"(\n",
" pd.concat(coefficient_comparisons, axis=1).T\n",
" .rename(columns={i: f for i,f in enumerate(stacked_features)})\n",
" .pipe(pd.melt, var_name=\"feature\", value_name=\"normalized_coefficient_difference\") \n",
" .pipe(\n",
" lambda df:\n",
" ggplot(df, aes(x=\"normalized_coefficient_difference\")) \n",
" + stat_ecdf()\n",
" + scale_x_continuous(breaks=[-1, -0.5, -0.2, 0, 0.2, 0.5, 1], limits=[-1,1])\n",
" + scale_y_continuous(breaks=[0, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0])\n",
" + theme_bw()\n",
" + labs(\n",
" x = \"Soln 1 - Soln 2 coefficient difference, scaled by average coefficient value\",\n",
" y = \"Percentile\"\n",
" )\n",
" )\n",
")"
]
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment