jiafengkevinchen/demo.ipynb

## demo.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import statsmodels.api as sm\n",
    "from linearmodels import IV2SLS\n",
    "from rpy2.robjects import Formula, pandas2ri\n",
    "from rpy2.robjects.packages import importr\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from statsmodels.tools import add_constant\n",
    "\n",
    "pandas2ri.activate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 278,
   "metadata": {},
   "outputs": [],
   "source": [
    "ivpack = importr(\"ivpack\")\n",
    "aer = importr(\"AER\")\n",
    "r_base = importr(\"base\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data-generating process\n",
    "We have $n=1000$ and a $k=5$ dimensional instrument $W$. The data generating process is\n",
    "$$\n",
    "Y = D + U \\\\\n",
    "D = \\sum_j \\pi_j W_j^2  + V  \\\\\n",
    "E[U,V|W]=0\n",
    "$$\n",
    "Note that TSLS with $W$ has population first stage being zero."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 279,
   "metadata": {},
   "outputs": [],
   "source": [
    "def simple_dgp(n, k, rho, pi, seed=2138, linear_weak=True):\n",
    "    rng = np.random.RandomState(seed)\n",
    "    w = rng.randn(n, k)\n",
    "    v = rng.randn(n)\n",
    "    u = rho * v + (1 - rho ** 2) ** 0.5 * rng.randn(n)\n",
    "\n",
    "    d = (w ** 2) @ pi + v\n",
    "    y = d + u\n",
    "    return (y, d, w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 280,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DGP Parameters\n",
    "np.random.seed(2138)\n",
    "n = 1000\n",
    "k = 5\n",
    "rho = 0.8\n",
    "pi = np.random.randn(k)\n",
    "\n",
    "# Outcome, treatment, instrument\n",
    "outcome, treatment, instrument = simple_dgp(n, k, rho, pi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 281,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split data in half\n",
    "outcome1, outcome2, treatment1, treatment2, instrument1, instrument2 = train_test_split(\n",
    "    outcome, treatment, instrument, train_size=0.5\n",
    ")\n",
    "folds = [(outcome1, treatment1, instrument1), (outcome2, treatment2, instrument2)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 282,
   "metadata": {},
   "outputs": [],
   "source": [
    "def hyperparameter_search(data, constructor, split=True, **kwargs):\n",
    "    \"\"\"\n",
    "    Doing the prediction well would probably require\n",
    "    a train-test-split-based (or cross validation based) approach to\n",
    "    do hyperparameter choice\n",
    "    \"\"\"\n",
    "    raise NotImplementedError"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 283,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_ar_interval(df, instrument=\"estimated_inst\"):\n",
    "    \"\"\"Calculate two Anderson-Rubin confidence intervals\"\"\"\n",
    "    s1 = ivpack.anderson_rubin_ci(\n",
    "        aer.ivreg(\n",
    "            Formula(\"outcome ~ treatment\"),\n",
    "            Formula(f\"~ {instrument} \"),\n",
    "            data=df.query(\"fold == 0\"),\n",
    "            x=True,\n",
    "        ),\n",
    "        conflevel=0.975,\n",
    "    )[0][0]\n",
    "\n",
    "    try:\n",
    "        s2 = ivpack.anderson_rubin_ci(\n",
    "            aer.ivreg(\n",
    "                Formula(\"outcome ~ treatment\"),\n",
    "                Formula(f\"~ {instrument} \"),\n",
    "                data=df.query(\"fold == 1\"),\n",
    "                x=True,\n",
    "            ),\n",
    "            conflevel=0.975,\n",
    "        )[0][0]\n",
    "    except:\n",
    "        s2 = None\n",
    "\n",
    "    return s1, s2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \"Hyperparameter searching\"\n",
    "try:\n",
    "    clf = hyperparameter_search(\n",
    "        folds[0],\n",
    "        RandomForestRegressor,\n",
    "        split=True,\n",
    "        max_depth=list(range(2, 10)),\n",
    "        n_estimators=[60, 80, 100],\n",
    "    )\n",
    "except NotImplementedError:\n",
    "    clf = RandomForestRegressor(20, max_depth=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Two-stage least squares\n",
    "We are not identified and so the Wald interval here is misleading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 287,
   "metadata": {},
   "outputs": [],
   "source": [
    "tsls_fit = IV2SLS(\n",
    "    dependent=outcome,\n",
    "    exog=np.ones((n, 1)),\n",
    "    endog=treatment.reshape((n, 1)),\n",
    "    instruments=instrument,\n",
    ").fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 288,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                          IV-2SLS Estimation Summary                          \n",
      "==============================================================================\n",
      "Dep. Variable:              dependent   R-squared:                      0.9356\n",
      "Estimator:                    IV-2SLS   Adj. R-squared:                 0.9355\n",
      "No. Observations:                1000   F-statistic:                    59.126\n",
      "Date:                Fri, Nov 20 2020   P-value (F-stat)                0.0000\n",
      "Time:                        16:40:32   Distribution:                  chi2(1)\n",
      "Cov. Estimator:                robust                                         \n",
      "                                                                              \n",
      "                             Parameter Estimates                              \n",
      "==============================================================================\n",
      "            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI\n",
      "------------------------------------------------------------------------------\n",
      "exog           0.4320     0.6486     0.6661     0.5054     -0.8392      1.7033\n",
      "endog          1.1071     0.1440     7.6894     0.0000      0.8249      1.3893\n",
      "==============================================================================\n",
      "\n",
      "Endogenous: endog\n",
      "Instruments: instruments.0, instruments.1, instruments.2, instruments.3, instruments.4\n",
      "Robust Covariance (Heteroskedastic)\n",
      "Debiased: False\n"
     ]
    }
   ],
   "source": [
    "print(tsls_fit)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 289,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "    First Stage Estimation Results    \n",
      "======================================\n",
      "                                 endog\n",
      "--------------------------------------\n",
      "R-squared                       0.0045\n",
      "Partial R-squared               0.0045\n",
      "Shea's R-squared                0.0045\n",
      "Partial F-statistic             4.2152\n",
      "P-value (Partial F-stat)        0.5189\n",
      "Partial F-stat Distn           chi2(5)\n",
      "========================== ===========\n",
      "exog                           -4.5000\n",
      "                             (-42.032)\n",
      "instruments.0                   0.1272\n",
      "                              (1.0512)\n",
      "instruments.1                   0.1110\n",
      "                              (0.6953)\n",
      "instruments.2                   0.0181\n",
      "                              (0.0875)\n",
      "instruments.3                   0.1127\n",
      "                              (1.0947)\n",
      "instruments.4                  -0.1025\n",
      "                             (-0.9026)\n",
      "--------------------------------------\n",
      "\n",
      "T-stats reported in parentheses\n",
      "T-stats use same covariance type as original model\n"
     ]
    }
   ],
   "source": [
    "print(tsls_fit.first_stage)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ols_fit = sm.OLS(endog=folds[0][1], exog=add_constant(folds[0][2])).fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predictably, out of sample R^2 is nonexistent\n",
    "def calculate_r2(predicted, truth):\n",
    "    \"\"\"Calculate R^2 of predicted and truth with 1 - (truth - predicted)^2 / truth.std()^2\"\"\"\n",
    "    return 1 - ((truth - predicted) ** 2).mean() / truth.std() ** 2\n",
    "\n",
    "\n",
    "calculate_r2(ols_fit.predict(add_constant(folds[1][2])), folds[1][1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Anderson-Rubin in TSLS\n",
    "Indeed, the Anderson-Rubin interval is $\\mathbb R$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 294,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat(\n",
    "    [\n",
    "        pd.DataFrame(\n",
    "            np.vstack([c.reshape((-1, n // 2)) for c in folds[0]]).T,\n",
    "            columns=[\"outcome\", \"treatment\"] + [f\"instrument_{i}\" for i in range(k)],\n",
    "        ).assign(fold=0),\n",
    "        pd.DataFrame(\n",
    "            np.vstack([c.reshape((-1, n // 2)) for c in folds[1]]).T,\n",
    "            columns=[\"outcome\", \"treatment\"] + [f\"instrument_{i}\" for i in range(k)],\n",
    "        ).assign(fold=1),\n",
    "    ],\n",
    "    axis=0,\n",
    ").reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 295,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "R[write to console]: Error in lm.fit(z, x, ...) : 0 (non-NA) cases\n",
      "Calls: <Anonymous> -> ivreg.fit -> lm.fit\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Whole Real Line\n"
     ]
    }
   ],
   "source": [
    "# Get AR interval on the whole data\n",
    "interval, _ = get_ar_interval(\n",
    "    df.assign(fold=0), instrument=\" + \".join([f\"instrument_{i}\" for i in range(k)])\n",
    ")\n",
    "print(interval)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MLSS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 296,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_predict_clf(clf, fold, predict_fold):\n",
    "    \"\"\"Fit clf on fold and predict on predict_fold\"\"\"\n",
    "    outcome, treatment, instrument = fold\n",
    "    clf.fit(instrument, treatment)\n",
    "    return clf.predict(predict_fold[2])\n",
    "\n",
    "\n",
    "def two_fold_cross_fit(folds):\n",
    "    \"\"\"Perform cross-fitting\"\"\"\n",
    "    fold1, fold2 = folds\n",
    "\n",
    "    predicted2 = fit_predict_clf(clf, fold1, fold2)\n",
    "    predicted1 = fit_predict_clf(clf, fold2, fold1)\n",
    "\n",
    "    return (\n",
    "        predicted1,\n",
    "        predicted2,\n",
    "        calculate_r2(predicted1, fold1[1]),\n",
    "        calculate_r2(predicted2, fold2[1]),\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 298,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.4784353278761917 0.47688073813216414\n"
     ]
    }
   ],
   "source": [
    "# Out of sample R^2 is reasonably high\n",
    "predicted1, predicted2, rsq1, rsq2 = two_fold_cross_fit(folds)\n",
    "print(rsq1, rsq2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 299,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"estimated_inst\"] = np.hstack([predicted1, predicted2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 300,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.974303823236021 , 1.04487327033638 ] [ 0.967955093666247 , 1.05656363389298 ]\n"
     ]
    }
   ],
   "source": [
    "# The AR interval is the intersection of the two 97.5% AR intervals\n",
    "int1, int2 = get_ar_interval(df)\n",
    "print(int1, int2)"
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "formats": "ipynb,py:percent"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import statsmodels.api as sm\n",
	"from linearmodels import IV2SLS\n",
	"from rpy2.robjects import Formula, pandas2ri\n",
	"from rpy2.robjects.packages import importr\n",
	"from sklearn.ensemble import RandomForestRegressor\n",
	"\n",
	"from sklearn.model_selection import train_test_split\n",
	"from statsmodels.tools import add_constant\n",
	"\n",
	"pandas2ri.activate()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 278,
	"metadata": {},
	"outputs": [],
	"source": [
	"ivpack = importr(\"ivpack\")\n",
	"aer = importr(\"AER\")\n",
	"r_base = importr(\"base\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Data-generating process\n",
	"We have $n=1000$ and a $k=5$ dimensional instrument $W$. The data generating process is\n",
	"$$\n",
	"Y = D + U \\\\\n",
	"D = \\sum_j \\pi_j W_j^2 + V \\\\\n",
	"E[U,V\|W]=0\n",
	"$$\n",
	"Note that TSLS with $W$ has population first stage being zero."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 279,
	"metadata": {},
	"outputs": [],
	"source": [
	"def simple_dgp(n, k, rho, pi, seed=2138, linear_weak=True):\n",
	" rng = np.random.RandomState(seed)\n",
	" w = rng.randn(n, k)\n",
	" v = rng.randn(n)\n",
	" u = rho * v + (1 - rho 2) 0.5 * rng.randn(n)\n",
	"\n",
	" d = (w ** 2) @ pi + v\n",
	" y = d + u\n",
	" return (y, d, w)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 280,
	"metadata": {},
	"outputs": [],
	"source": [
	"# DGP Parameters\n",
	"np.random.seed(2138)\n",
	"n = 1000\n",
	"k = 5\n",
	"rho = 0.8\n",
	"pi = np.random.randn(k)\n",
	"\n",
	"# Outcome, treatment, instrument\n",
	"outcome, treatment, instrument = simple_dgp(n, k, rho, pi)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 281,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Split data in half\n",
	"outcome1, outcome2, treatment1, treatment2, instrument1, instrument2 = train_test_split(\n",
	" outcome, treatment, instrument, train_size=0.5\n",
	")\n",
	"folds = [(outcome1, treatment1, instrument1), (outcome2, treatment2, instrument2)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 282,
	"metadata": {},
	"outputs": [],
	"source": [
	"def hyperparameter_search(data, constructor, split=True, **kwargs):\n",
	" \"\"\"\n",
	" Doing the prediction well would probably require\n",
	" a train-test-split-based (or cross validation based) approach to\n",
	" do hyperparameter choice\n",
	" \"\"\"\n",
	" raise NotImplementedError"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 283,
	"metadata": {},
	"outputs": [],
	"source": [
	"def get_ar_interval(df, instrument=\"estimated_inst\"):\n",
	" \"\"\"Calculate two Anderson-Rubin confidence intervals\"\"\"\n",
	" s1 = ivpack.anderson_rubin_ci(\n",
	" aer.ivreg(\n",
	" Formula(\"outcome ~ treatment\"),\n",
	" Formula(f\"~ {instrument} \"),\n",
	" data=df.query(\"fold == 0\"),\n",
	" x=True,\n",
	" ),\n",
	" conflevel=0.975,\n",
	" )[0][0]\n",
	"\n",
	" try:\n",
	" s2 = ivpack.anderson_rubin_ci(\n",
	" aer.ivreg(\n",
	" Formula(\"outcome ~ treatment\"),\n",
	" Formula(f\"~ {instrument} \"),\n",
	" data=df.query(\"fold == 1\"),\n",
	" x=True,\n",
	" ),\n",
	" conflevel=0.975,\n",
	" )[0][0]\n",
	" except:\n",
	" s2 = None\n",
	"\n",
	" return s1, s2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 284,
	"metadata": {},
	"outputs": [],
	"source": [
	"# \"Hyperparameter searching\"\n",
	"try:\n",
	" clf = hyperparameter_search(\n",
	" folds[0],\n",
	" RandomForestRegressor,\n",
	" split=True,\n",
	" max_depth=list(range(2, 10)),\n",
	" n_estimators=[60, 80, 100],\n",
	" )\n",
	"except NotImplementedError:\n",
	" clf = RandomForestRegressor(20, max_depth=3)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Two-stage least squares\n",
	"We are not identified and so the Wald interval here is misleading"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 287,
	"metadata": {},
	"outputs": [],
	"source": [
	"tsls_fit = IV2SLS(\n",
	" dependent=outcome,\n",
	" exog=np.ones((n, 1)),\n",
	" endog=treatment.reshape((n, 1)),\n",
	" instruments=instrument,\n",
	").fit()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 288,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" IV-2SLS Estimation Summary \n",
	"==============================================================================\n",
	"Dep. Variable: dependent R-squared: 0.9356\n",
	"Estimator: IV-2SLS Adj. R-squared: 0.9355\n",
	"No. Observations: 1000 F-statistic: 59.126\n",
	"Date: Fri, Nov 20 2020 P-value (F-stat) 0.0000\n",
	"Time: 16:40:32 Distribution: chi2(1)\n",
	"Cov. Estimator: robust \n",
	" \n",
	" Parameter Estimates \n",
	"==============================================================================\n",
	" Parameter Std. Err. T-stat P-value Lower CI Upper CI\n",
	"------------------------------------------------------------------------------\n",
	"exog 0.4320 0.6486 0.6661 0.5054 -0.8392 1.7033\n",
	"endog 1.1071 0.1440 7.6894 0.0000 0.8249 1.3893\n",
	"==============================================================================\n",
	"\n",
	"Endogenous: endog\n",
	"Instruments: instruments.0, instruments.1, instruments.2, instruments.3, instruments.4\n",
	"Robust Covariance (Heteroskedastic)\n",
	"Debiased: False\n"
	]
	}
	],
	"source": [
	"print(tsls_fit)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 289,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" First Stage Estimation Results \n",
	"======================================\n",
	" endog\n",
	"--------------------------------------\n",
	"R-squared 0.0045\n",
	"Partial R-squared 0.0045\n",
	"Shea's R-squared 0.0045\n",
	"Partial F-statistic 4.2152\n",
	"P-value (Partial F-stat) 0.5189\n",
	"Partial F-stat Distn chi2(5)\n",
	"========================== ===========\n",
	"exog -4.5000\n",
	" (-42.032)\n",
	"instruments.0 0.1272\n",
	" (1.0512)\n",
	"instruments.1 0.1110\n",
	" (0.6953)\n",
	"instruments.2 0.0181\n",
	" (0.0875)\n",
	"instruments.3 0.1127\n",
	" (1.0947)\n",
	"instruments.4 -0.1025\n",
	" (-0.9026)\n",
	"--------------------------------------\n",
	"\n",
	"T-stats reported in parentheses\n",
	"T-stats use same covariance type as original model\n"
	]
	}
	],
	"source": [
	"print(tsls_fit.first_stage)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"ols_fit = sm.OLS(endog=folds[0][1], exog=add_constant(folds[0][2])).fit()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Predictably, out of sample R^2 is nonexistent\n",
	"def calculate_r2(predicted, truth):\n",
	" \"\"\"Calculate R^2 of predicted and truth with 1 - (truth - predicted)^2 / truth.std()^2\"\"\"\n",
	" return 1 - ((truth - predicted) 2).mean() / truth.std() 2\n",
	"\n",
	"\n",
	"calculate_r2(ols_fit.predict(add_constant(folds[1][2])), folds[1][1])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Anderson-Rubin in TSLS\n",
	"Indeed, the Anderson-Rubin interval is $\\mathbb R$"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 294,
	"metadata": {},
	"outputs": [],
	"source": [
	"df = pd.concat(\n",
	" [\n",
	" pd.DataFrame(\n",
	" np.vstack([c.reshape((-1, n // 2)) for c in folds[0]]).T,\n",
	" columns=[\"outcome\", \"treatment\"] + [f\"instrument_{i}\" for i in range(k)],\n",
	" ).assign(fold=0),\n",
	" pd.DataFrame(\n",
	" np.vstack([c.reshape((-1, n // 2)) for c in folds[1]]).T,\n",
	" columns=[\"outcome\", \"treatment\"] + [f\"instrument_{i}\" for i in range(k)],\n",
	" ).assign(fold=1),\n",
	" ],\n",
	" axis=0,\n",
	").reset_index(drop=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 295,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"R[write to console]: Error in lm.fit(z, x, ...) : 0 (non-NA) cases\n",
	"Calls: <Anonymous> -> ivreg.fit -> lm.fit\n",
	"\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Whole Real Line\n"
	]
	}
	],
	"source": [
	"# Get AR interval on the whole data\n",
	"interval, _ = get_ar_interval(\n",
	" df.assign(fold=0), instrument=\" + \".join([f\"instrument_{i}\" for i in range(k)])\n",
	")\n",
	"print(interval)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# MLSS"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 296,
	"metadata": {},
	"outputs": [],
	"source": [
	"def fit_predict_clf(clf, fold, predict_fold):\n",
	" \"\"\"Fit clf on fold and predict on predict_fold\"\"\"\n",
	" outcome, treatment, instrument = fold\n",
	" clf.fit(instrument, treatment)\n",
	" return clf.predict(predict_fold[2])\n",
	"\n",
	"\n",
	"def two_fold_cross_fit(folds):\n",
	" \"\"\"Perform cross-fitting\"\"\"\n",
	" fold1, fold2 = folds\n",
	"\n",
	" predicted2 = fit_predict_clf(clf, fold1, fold2)\n",
	" predicted1 = fit_predict_clf(clf, fold2, fold1)\n",
	"\n",
	" return (\n",
	" predicted1,\n",
	" predicted2,\n",
	" calculate_r2(predicted1, fold1[1]),\n",
	" calculate_r2(predicted2, fold2[1]),\n",
	" )"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 298,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0.4784353278761917 0.47688073813216414\n"
	]
	}
	],
	"source": [
	"# Out of sample R^2 is reasonably high\n",
	"predicted1, predicted2, rsq1, rsq2 = two_fold_cross_fit(folds)\n",
	"print(rsq1, rsq2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 299,
	"metadata": {},
	"outputs": [],
	"source": [
	"df[\"estimated_inst\"] = np.hstack([predicted1, predicted2])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 300,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[ 0.974303823236021 , 1.04487327033638 ] [ 0.967955093666247 , 1.05656363389298 ]\n"
	]
	}
	],
	"source": [
	"# The AR interval is the intersection of the two 97.5% AR intervals\n",
	"int1, int2 = get_ar_interval(df)\n",
	"print(int1, int2)"
	]
	}
	],
	"metadata": {
	"jupytext": {
	"formats": "ipynb,py:percent"
	},
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}