Skip to content

Instantly share code, notes, and snippets.

@koaning
Last active June 28, 2020 19:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save koaning/1a606e07c9b27d1889ada4185959d87a to your computer and use it in GitHub Desktop.
Save koaning/1a606e07c9b27d1889ada4185959d87a to your computer and use it in GitHub Desktop.
benchmark
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import gif\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pylab as plt\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.pipeline import Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_boston"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"X, y = load_boston(return_X_y=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"jupyter": {
"source_hidden": true
}
},
"outputs": [],
"source": [
"# Copyright (c) Microsoft Corporation and contributors.\n",
"# Licensed under the MIT License.\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.utils import check_array\n",
"from sklearn.utils.validation import check_is_fitted\n",
"\n",
"\n",
"class CorrelationRemover(BaseEstimator, TransformerMixin):\n",
" r\"\"\"\n",
" A component that filters out sensitive correlations in a dataset.\n",
"\n",
" CorrelationRemover applies a linear transformation to the non-sensitive feature columns in order\n",
" to remove their correlation with the sensitive feature columns while retaining as much information\n",
" as possible (as measured by the least-squares error).\n",
"\n",
" Parameters\n",
" ----------\n",
" sensitive_feature_ids : list of columns to filter out; this can be a sequence of either int,\n",
" in the case of numpy, or string, in the case of pandas.\n",
" alpha : parameter to control how much to filter, for alpha=1.0 we filter out\n",
" all information while for alpha=0.0 we don't apply any.\n",
" center : setting to tell if this preprocessing step should center the data for\n",
" numerical stability\n",
"\n",
" Notes\n",
" -----\n",
"\n",
" This method will change the original dataset by removing all correlation with sensitive values.\n",
" To describe that mathematically, let's assume in the original dataset :math:`X` we've got a set of\n",
" sensitive attributes :math:`S` and a set of non-sensitive attributes :math:`Z`. Mathematically, this method\n",
" will be solving the following problem.\n",
"\n",
" .. math::\n",
"\n",
" \\min _{\\mathbf{z}_{1}, \\ldots, \\mathbf{z}_{n}} \\sum_{i=1}^{n}\\left\\|\\mathbf{z}_{i}-\\mathbf{x}_{i}\\right\\|^{2} \\\\\n",
" \\text{subject to} \\\\\n",
" \\frac{1}{n} \\sum_{i=1}^{n} \\mathbf{z}_{i}\\left(\\mathbf{s}_{i}-\\overline{\\mathbf{s}}\\right)^{T}=\\mathbf{0}\n",
"\n",
"\n",
" The solution to this problem is found by centering sensitive features, fitting a\n",
" linear regression model to the non-sensitive features and reporting the residual.\n",
"\n",
" The columns in :math:`S` will be dropped but the hyper parameter :math:`\\alpha` does allow you to tweak\n",
" the amount of filtering that gets applied.\n",
"\n",
" .. math::\n",
"\n",
" X_{\\text{tfm}} = \\alpha X_{\\text{filtered}} + (1-\\alpha) X_{\\text{orig}}\n",
" \"\"\"\n",
"\n",
" def __init__(self, *, sensitive_feature_ids=None, alpha=1.0, center=True):\n",
" self.columns = sensitive_feature_ids\n",
" self.alpha = alpha\n",
" self.center = center\n",
"\n",
" def _split_X(self, X):\n",
" \"\"\"Split up X into a sensitive and non-sensitive group.\"\"\"\n",
" if isinstance(X, pd.DataFrame):\n",
" sens_df = X[self.columns]\n",
" non_sens_df = X[[c for c in X.columns if c not in self.columns]]\n",
" return sens_df.values, non_sens_df.values\n",
" non_sensitive = [i for i in range(X.shape[1]) if i not in self.columns]\n",
" return X[:, non_sensitive], X[:, self.columns]\n",
"\n",
" def fit(self, X, y=None):\n",
" \"\"\"Learn the projection required to make the dataset orthogonal to sensitive columns.\"\"\"\n",
" X = check_array(X, estimator=self, force_all_finite=True)\n",
" if (not self.columns) or (len(self.columns) == 0):\n",
" raise ValueError(f\"No sensitive feature ids were passed to this object, got {self.columns}\")\n",
" X_use, X_sensitive = self._split_X(X)\n",
" self.sensitive_mean_ = X_sensitive.mean()\n",
" X_s_center = X_sensitive - self.sensitive_mean_\n",
" self.beta_, _, _, _ = np.linalg.lstsq(X_s_center, X_use, rcond=None)\n",
" self.X_shape_ = X.shape\n",
" return self\n",
"\n",
" def transform(self, X):\n",
" \"\"\"Transform X by applying the information filter.\"\"\"\n",
" X = check_array(X, estimator=self, dtype=None, force_all_finite=True)\n",
" check_is_fitted(self, [\"beta_\", \"X_shape_\", \"sensitive_mean_\"])\n",
" if self.X_shape_[1] != X.shape[1]:\n",
" raise ValueError(f\"The trained data has {self.X_shape_[1]} while this dataset has {X.shape[1]}.\")\n",
" X_use, X_sensitive = self._split_X(X)\n",
" X_s_center = X_sensitive - self.sensitive_mean_\n",
" X_filtered = X_use - X_s_center.dot(self.beta_)\n",
" X_use = np.atleast_2d(X_use)\n",
" X_filtered = np.atleast_2d(X_filtered)\n",
" return self.alpha * X_filtered + (1 - self.alpha) * X_use\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.utils import check_array\n",
"from sklearn.utils.validation import check_is_fitted\n",
"\n",
"from sklego.common import as_list\n",
"\n",
"\n",
"def scalar_projection(vec, unto):\n",
" return vec.dot(unto) / unto.dot(unto)\n",
"\n",
"\n",
"def vector_projection(vec, unto):\n",
" return scalar_projection(vec, unto) * unto\n",
"\n",
"\n",
"class InformationFilter(BaseEstimator, TransformerMixin):\n",
" \"\"\"\n",
" The `InformationFilter` uses a variant of the Gram-Schmidt process\n",
" to filter information out of the dataset. This can be useful if you\n",
" want to filter information out of a dataset because of fairness.\n",
" To explain how it works: given a training matrix :math:`X` that contains\n",
" columns :math:`x_1, ..., x_k`. If we assume columns :math:`x_1` and :math:`x_2`\n",
" to be the sensitive columns then the information-filter will\n",
" remove information by applying these transformations;\n",
" .. math::\n",
" \\\\begin{split}\n",
" v_1 & = x_1 \\\\\\\\\n",
" v_2 & = x_2 - \\\\frac{x_2 v_1}{v_1 v_1} v_1\\\\\\\\\n",
" v_3 & = x_3 - \\\\frac{x_3 v_1}{v_1 v_1} v_1 - \\\\frac{x_3 v_2}{v_2 v_2} v_2\\\\\\\\\n",
" ... \\\\\\\\\n",
" v_k & = x_k - \\\\frac{x_k v_1}{v_1 v_1} v_1 - \\\\frac{x_k v_2}{v_2 v_2} v_2\n",
" \\\\end{split}\n",
" Concatenating our vectors (but removing the sensitive ones) gives us\n",
" a new training matrix :math:`X_{fair} = [v_3, ..., v_k]`.\n",
" :param columns: the columns to filter out; this can be a sequence of either int\n",
" (in the case of numpy) or string (in the case of pandas).\n",
" :param alpha: parameter to control how much to filter, for alpha=1 we filter out\n",
" all information while for alpha=0 we don't apply any.\n",
" \"\"\"\n",
"\n",
" def __init__(self, columns, alpha=1):\n",
" self.columns = columns\n",
" self.alpha = alpha\n",
"\n",
" def _check_coltype(self, X):\n",
" for col in as_list(self.columns):\n",
" if isinstance(col, str):\n",
" if isinstance(X, np.ndarray):\n",
" raise ValueError(\n",
" f\"column {col} is a string but datatype receive is numpy.\"\n",
" )\n",
" if isinstance(X, pd.DataFrame):\n",
" if col not in X.columns:\n",
" raise ValueError(f\"column {col} is not in {X.columns}\")\n",
" if isinstance(col, int):\n",
" if col not in range(np.atleast_2d(np.array(X)).shape[1]):\n",
" raise ValueError(\n",
" f\"column {col} is out of bounds for input shape {X.shape}\"\n",
" )\n",
"\n",
" def _col_idx(self, X, name):\n",
" if isinstance(name, str):\n",
" if isinstance(X, np.ndarray):\n",
" raise ValueError(\n",
" \"You cannot have a column of type string on a numpy input matrix.\"\n",
" )\n",
" return {name: i for i, name in enumerate(X.columns)}[name]\n",
" return name\n",
"\n",
" def _make_v_vectors(self, X, col_ids):\n",
" vs = np.zeros((X.shape[0], len(col_ids)))\n",
" for i, c in enumerate(col_ids):\n",
" vs[:, i] = X[:, col_ids[i]]\n",
" for j in range(0, i):\n",
" vs[:, i] = vs[:, i] - vector_projection(vs[:, i], vs[:, j])\n",
" return vs\n",
"\n",
" def fit(self, X, y=None):\n",
" \"\"\"Learn the projection required to make the dataset orthogonal to sensitive columns.\"\"\"\n",
" self._check_coltype(X)\n",
" self.col_ids_ = [\n",
" v if isinstance(v, int) else self._col_idx(X, v)\n",
" for v in as_list(self.columns)\n",
" ]\n",
" X = check_array(X, estimator=self)\n",
" X_fair = X.copy()\n",
" v_vectors = self._make_v_vectors(X, self.col_ids_)\n",
" # Gram-Schmidt process, but only on sensitive attributes\n",
" for i, col in enumerate(X_fair.T):\n",
" for v in v_vectors.T:\n",
" X_fair[:, i] = X_fair[:, i] - vector_projection(X_fair[:, i], v)\n",
" # we want to learn matrix P: X P = X_fair\n",
" # this means we first need to create X_fair in order to learn P\n",
" self.projection_, resid, rank, s = np.linalg.lstsq(X, X_fair, rcond=None)\n",
" return self\n",
"\n",
" def transform(self, X):\n",
" \"\"\"Transforms X by applying the information filter.\"\"\"\n",
" check_is_fitted(self, [\"projection_\", \"col_ids_\"])\n",
" self._check_coltype(X)\n",
" X = check_array(X, estimator=self)\n",
" # apply the projection and remove the column we won't need\n",
" X_fair = X @ self.projection_\n",
" X_removed = np.delete(X_fair, self.col_ids_, axis=1)\n",
" X_orig = np.delete(X, self.col_ids_, axis=1)\n",
" return self.alpha * np.atleast_2d(X_removed) + (1 - self.alpha) * np.atleast_2d(\n",
" X_orig\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"@gif.frame\n",
"def frame(alpha, filter_method):\n",
" remover = filter_method\n",
" pipe = Pipeline([\n",
" ('filter', remover),\n",
" ('mod', LinearRegression())\n",
" ])\n",
" pipe.fit(X, y)\n",
" pred = pipe.predict(X)\n",
" plt.figure(figsize=(12, 4))\n",
"\n",
" plt.subplot(131)\n",
" plt.scatter(pred, y)\n",
" plt.title(f'pred vs. actual alpha={alpha}')\n",
"\n",
" plt.subplot(132)\n",
" c11_upper, c11_lower = pred[X[:, 11] > X[:, 11].mean()], pred[X[:, 11] <= X[:, 11].mean()]\n",
" plt.hist(c11_lower, bins=30, density=True, alpha=0.5, label='lower')\n",
" plt.hist(c11_upper, bins=30, density=True, alpha=0.5, label='upper')\n",
" plt.legend()\n",
" plt.title('predictions split on col B')\n",
"\n",
" plt.subplot(133)\n",
" c12_upper, c12_lower = pred[X[:, 12] > X[:, 12].mean()], pred[X[:, 12] <= X[:, 12].mean()]\n",
" plt.hist(c12_lower, bins=30, density=True, alpha=0.5, label='lower')\n",
" plt.hist(c12_upper, bins=30, density=True, alpha=0.5, label='upper')\n",
" plt.legend()\n",
" plt.title('predictions split on col LSTAT')\n",
" data.append([filter_method.__class__.__name__, \n",
" alpha, \n",
" c11_upper.mean() - c11_lower.mean(), \n",
" c12_upper.mean() - c12_lower.mean()]);"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"data = []\n",
"frames = [frame(alpha=np.round(i, 2), filter_method=InformationFilter(columns=[11, 12], alpha=np.round(i, 2))) \n",
" for i in np.linspace(-3.0, 2.0, 51)]\n",
"gif.save(frames, \"boston_filter.gif\", duration=400)\n",
"frames = [frame(alpha=np.round(i, 2), filter_method=CorrelationRemover(sensitive_feature_ids=[11, 12], alpha=np.round(i, 2))) \n",
" for i in np.linspace(-3.0, 2.0, 51)]\n",
"gif.save(frames, \"boston_corr.gif\", duration=400)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# from IPython.display import Image\n",
"# Image(\"boston_filter.gif\")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# from IPython.display import Image\n",
"# Image(\"boston_corr.gif\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import plotnine as p9"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"run_df = pd.DataFrame(data, columns=['method', 'alpha', 'c11_diff', 'c12_diff'])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"<ggplot: (322863985)>"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(p9.ggplot() + \n",
" p9.geom_line(data=run_df, mapping=p9.aes('alpha','c11_diff', color='method')) + \n",
" p9.geom_point(data=run_df, mapping=p9.aes('alpha','c12_diff', color='method')))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment