Skip to content

Instantly share code, notes, and snippets.

@jiafengkevinchen
Last active February 12, 2022 07:42
Show Gist options
  • Save jiafengkevinchen/439ae146d235834efa884533d1ff9915 to your computer and use it in GitHub Desktop.
Save jiafengkevinchen/439ae146d235834efa884533d1ff9915 to your computer and use it in GitHub Desktop.
A sample implementation of a sample-split machine learning IV estimator (https://arxiv.org/abs/2011.06158); Google Colab: https://colab.research.google.com/gist/jiafengkevinchen/439ae146d235834efa884533d1ff9915/demo.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import statsmodels.api as sm\n",
"from linearmodels import IV2SLS\n",
"from rpy2.robjects import Formula, pandas2ri\n",
"from rpy2.robjects.packages import importr\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from statsmodels.tools import add_constant\n",
"\n",
"pandas2ri.activate()"
]
},
{
"cell_type": "code",
"execution_count": 278,
"metadata": {},
"outputs": [],
"source": [
"ivpack = importr(\"ivpack\")\n",
"aer = importr(\"AER\")\n",
"r_base = importr(\"base\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data-generating process\n",
"We have $n=1000$ and a $k=5$ dimensional instrument $W$. The data generating process is\n",
"$$\n",
"Y = D + U \\\\\n",
"D = \\sum_j \\pi_j W_j^2 + V \\\\\n",
"E[U,V|W]=0\n",
"$$\n",
"Note that TSLS with $W$ has population first stage being zero."
]
},
{
"cell_type": "code",
"execution_count": 279,
"metadata": {},
"outputs": [],
"source": [
"def simple_dgp(n, k, rho, pi, seed=2138, linear_weak=True):\n",
" rng = np.random.RandomState(seed)\n",
" w = rng.randn(n, k)\n",
" v = rng.randn(n)\n",
" u = rho * v + (1 - rho ** 2) ** 0.5 * rng.randn(n)\n",
"\n",
" d = (w ** 2) @ pi + v\n",
" y = d + u\n",
" return (y, d, w)"
]
},
{
"cell_type": "code",
"execution_count": 280,
"metadata": {},
"outputs": [],
"source": [
"# DGP Parameters\n",
"np.random.seed(2138)\n",
"n = 1000\n",
"k = 5\n",
"rho = 0.8\n",
"pi = np.random.randn(k)\n",
"\n",
"# Outcome, treatment, instrument\n",
"outcome, treatment, instrument = simple_dgp(n, k, rho, pi)"
]
},
{
"cell_type": "code",
"execution_count": 281,
"metadata": {},
"outputs": [],
"source": [
"# Split data in half\n",
"outcome1, outcome2, treatment1, treatment2, instrument1, instrument2 = train_test_split(\n",
" outcome, treatment, instrument, train_size=0.5\n",
")\n",
"folds = [(outcome1, treatment1, instrument1), (outcome2, treatment2, instrument2)]"
]
},
{
"cell_type": "code",
"execution_count": 282,
"metadata": {},
"outputs": [],
"source": [
"def hyperparameter_search(data, constructor, split=True, **kwargs):\n",
" \"\"\"\n",
" Doing the prediction well would probably require\n",
" a train-test-split-based (or cross validation based) approach to\n",
" do hyperparameter choice\n",
" \"\"\"\n",
" raise NotImplementedError"
]
},
{
"cell_type": "code",
"execution_count": 283,
"metadata": {},
"outputs": [],
"source": [
"def get_ar_interval(df, instrument=\"estimated_inst\"):\n",
" \"\"\"Calculate two Anderson-Rubin confidence intervals\"\"\"\n",
" s1 = ivpack.anderson_rubin_ci(\n",
" aer.ivreg(\n",
" Formula(\"outcome ~ treatment\"),\n",
" Formula(f\"~ {instrument} \"),\n",
" data=df.query(\"fold == 0\"),\n",
" x=True,\n",
" ),\n",
" conflevel=0.975,\n",
" )[0][0]\n",
"\n",
" try:\n",
" s2 = ivpack.anderson_rubin_ci(\n",
" aer.ivreg(\n",
" Formula(\"outcome ~ treatment\"),\n",
" Formula(f\"~ {instrument} \"),\n",
" data=df.query(\"fold == 1\"),\n",
" x=True,\n",
" ),\n",
" conflevel=0.975,\n",
" )[0][0]\n",
" except:\n",
" s2 = None\n",
"\n",
" return s1, s2"
]
},
{
"cell_type": "code",
"execution_count": 284,
"metadata": {},
"outputs": [],
"source": [
"# \"Hyperparameter searching\"\n",
"try:\n",
" clf = hyperparameter_search(\n",
" folds[0],\n",
" RandomForestRegressor,\n",
" split=True,\n",
" max_depth=list(range(2, 10)),\n",
" n_estimators=[60, 80, 100],\n",
" )\n",
"except NotImplementedError:\n",
" clf = RandomForestRegressor(20, max_depth=3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Two-stage least squares\n",
"We are not identified and so the Wald interval here is misleading"
]
},
{
"cell_type": "code",
"execution_count": 287,
"metadata": {},
"outputs": [],
"source": [
"tsls_fit = IV2SLS(\n",
" dependent=outcome,\n",
" exog=np.ones((n, 1)),\n",
" endog=treatment.reshape((n, 1)),\n",
" instruments=instrument,\n",
").fit()"
]
},
{
"cell_type": "code",
"execution_count": 288,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" IV-2SLS Estimation Summary \n",
"==============================================================================\n",
"Dep. Variable: dependent R-squared: 0.9356\n",
"Estimator: IV-2SLS Adj. R-squared: 0.9355\n",
"No. Observations: 1000 F-statistic: 59.126\n",
"Date: Fri, Nov 20 2020 P-value (F-stat) 0.0000\n",
"Time: 16:40:32 Distribution: chi2(1)\n",
"Cov. Estimator: robust \n",
" \n",
" Parameter Estimates \n",
"==============================================================================\n",
" Parameter Std. Err. T-stat P-value Lower CI Upper CI\n",
"------------------------------------------------------------------------------\n",
"exog 0.4320 0.6486 0.6661 0.5054 -0.8392 1.7033\n",
"endog 1.1071 0.1440 7.6894 0.0000 0.8249 1.3893\n",
"==============================================================================\n",
"\n",
"Endogenous: endog\n",
"Instruments: instruments.0, instruments.1, instruments.2, instruments.3, instruments.4\n",
"Robust Covariance (Heteroskedastic)\n",
"Debiased: False\n"
]
}
],
"source": [
"print(tsls_fit)"
]
},
{
"cell_type": "code",
"execution_count": 289,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" First Stage Estimation Results \n",
"======================================\n",
" endog\n",
"--------------------------------------\n",
"R-squared 0.0045\n",
"Partial R-squared 0.0045\n",
"Shea's R-squared 0.0045\n",
"Partial F-statistic 4.2152\n",
"P-value (Partial F-stat) 0.5189\n",
"Partial F-stat Distn chi2(5)\n",
"========================== ===========\n",
"exog -4.5000\n",
" (-42.032)\n",
"instruments.0 0.1272\n",
" (1.0512)\n",
"instruments.1 0.1110\n",
" (0.6953)\n",
"instruments.2 0.0181\n",
" (0.0875)\n",
"instruments.3 0.1127\n",
" (1.0947)\n",
"instruments.4 -0.1025\n",
" (-0.9026)\n",
"--------------------------------------\n",
"\n",
"T-stats reported in parentheses\n",
"T-stats use same covariance type as original model\n"
]
}
],
"source": [
"print(tsls_fit.first_stage)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ols_fit = sm.OLS(endog=folds[0][1], exog=add_constant(folds[0][2])).fit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Predictably, out of sample R^2 is nonexistent\n",
"def calculate_r2(predicted, truth):\n",
" \"\"\"Calculate R^2 of predicted and truth with 1 - (truth - predicted)^2 / truth.std()^2\"\"\"\n",
" return 1 - ((truth - predicted) ** 2).mean() / truth.std() ** 2\n",
"\n",
"\n",
"calculate_r2(ols_fit.predict(add_constant(folds[1][2])), folds[1][1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Anderson-Rubin in TSLS\n",
"Indeed, the Anderson-Rubin interval is $\\mathbb R$"
]
},
{
"cell_type": "code",
"execution_count": 294,
"metadata": {},
"outputs": [],
"source": [
"df = pd.concat(\n",
" [\n",
" pd.DataFrame(\n",
" np.vstack([c.reshape((-1, n // 2)) for c in folds[0]]).T,\n",
" columns=[\"outcome\", \"treatment\"] + [f\"instrument_{i}\" for i in range(k)],\n",
" ).assign(fold=0),\n",
" pd.DataFrame(\n",
" np.vstack([c.reshape((-1, n // 2)) for c in folds[1]]).T,\n",
" columns=[\"outcome\", \"treatment\"] + [f\"instrument_{i}\" for i in range(k)],\n",
" ).assign(fold=1),\n",
" ],\n",
" axis=0,\n",
").reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 295,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"R[write to console]: Error in lm.fit(z, x, ...) : 0 (non-NA) cases\n",
"Calls: <Anonymous> -> ivreg.fit -> lm.fit\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Whole Real Line\n"
]
}
],
"source": [
"# Get AR interval on the whole data\n",
"interval, _ = get_ar_interval(\n",
" df.assign(fold=0), instrument=\" + \".join([f\"instrument_{i}\" for i in range(k)])\n",
")\n",
"print(interval)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MLSS"
]
},
{
"cell_type": "code",
"execution_count": 296,
"metadata": {},
"outputs": [],
"source": [
"def fit_predict_clf(clf, fold, predict_fold):\n",
" \"\"\"Fit clf on fold and predict on predict_fold\"\"\"\n",
" outcome, treatment, instrument = fold\n",
" clf.fit(instrument, treatment)\n",
" return clf.predict(predict_fold[2])\n",
"\n",
"\n",
"def two_fold_cross_fit(folds):\n",
" \"\"\"Perform cross-fitting\"\"\"\n",
" fold1, fold2 = folds\n",
"\n",
" predicted2 = fit_predict_clf(clf, fold1, fold2)\n",
" predicted1 = fit_predict_clf(clf, fold2, fold1)\n",
"\n",
" return (\n",
" predicted1,\n",
" predicted2,\n",
" calculate_r2(predicted1, fold1[1]),\n",
" calculate_r2(predicted2, fold2[1]),\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 298,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.4784353278761917 0.47688073813216414\n"
]
}
],
"source": [
"# Out of sample R^2 is reasonably high\n",
"predicted1, predicted2, rsq1, rsq2 = two_fold_cross_fit(folds)\n",
"print(rsq1, rsq2)"
]
},
{
"cell_type": "code",
"execution_count": 299,
"metadata": {},
"outputs": [],
"source": [
"df[\"estimated_inst\"] = np.hstack([predicted1, predicted2])"
]
},
{
"cell_type": "code",
"execution_count": 300,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0.974303823236021 , 1.04487327033638 ] [ 0.967955093666247 , 1.05656363389298 ]\n"
]
}
],
"source": [
"# The AR interval is the intersection of the two 97.5% AR intervals\n",
"int1, int2 = get_ar_interval(df)\n",
"print(int1, int2)"
]
}
],
"metadata": {
"jupytext": {
"formats": "ipynb,py:percent"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment