NaimKabir/rct_sim.ipynb

## rct_sim.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from itertools import product\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Participant simulation\n",
    "\n",
    "First we'll create *n* participants in our experiment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "n = 10000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For each of these participants, we'll randomly assign them some (Compliance, Response) behavior combination."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "compliance_partitions = [\n",
    "    \"always_taker\",\n",
    "    \"complier\",\n",
    "    \"defier\",\n",
    "    \"never_taker\",\n",
    "]\n",
    "response_partitions = [\n",
    "    \"always_better\",\n",
    "    \"helped\",\n",
    "    \"hurt\",\n",
    "    \"never_better\",\n",
    "]\n",
    "\n",
    "# Take the cross product of these sets of types\n",
    "\n",
    "partition_types = product(\n",
    "    compliance_partitions, response_partitions\n",
    ")\n",
    "partition_types = np.array(list(partition_types))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We'll simulate probabilities that our participants\n",
    "# will belong to one of the 16 possible behavior combinations\n",
    "\n",
    "# I'm setting up a contrived example\n",
    "# to prove my point with these arbitrary floats--\n",
    "# trust me it's instructive!\n",
    "\n",
    "arbitrary_floats = np.array(\n",
    "    [\n",
    "        [\n",
    "            0,\n",
    "            0,\n",
    "            0,\n",
    "            0.01,\n",
    "            0,\n",
    "            0.14,\n",
    "            0,\n",
    "            0.16,\n",
    "            0.32,\n",
    "            0,\n",
    "            0.31,\n",
    "            0,\n",
    "            0.04,\n",
    "            0.02,\n",
    "            0,\n",
    "            0,\n",
    "        ]\n",
    "    ]\n",
    ")\n",
    "partition_probabilities = normalize(\n",
    "    arbitrary_floats, \"l1\"\n",
    ")\n",
    "partition_probabilities = (\n",
    "    partition_probabilities.flatten()\n",
    ")\n",
    "\n",
    "# to be a true set of probabilities, the vector sum\n",
    "# needs to be 1\n",
    "\n",
    "# sometimes this can fail because of precision errors,\n",
    "# so let's assert it\n",
    "\n",
    "assert partition_probabilities.sum() == 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drawing participant compliance and response behaviors according to the\n",
    "# specified distribution\n",
    "\n",
    "participant_partition = np.random.choice(\n",
    "    range(len(partition_types)),\n",
    "    n,\n",
    "    p=partition_probabilities,\n",
    ")\n",
    "\n",
    "compliance_response_pairs = zip(\n",
    "    *partition_types[participant_partition]\n",
    ")\n",
    "compliance_type, response_type = list(\n",
    "    compliance_response_pairs\n",
    ")\n",
    "\n",
    "# assigning participants to Control and Treatment groups\n",
    "# with 50% probability\n",
    "\n",
    "assignments = [\"control\", \"treatment\"]\n",
    "participant_assignment = np.random.choice(\n",
    "    assignments, n\n",
    ")\n",
    "\n",
    "# compiling all information into a dataframe\n",
    "# that simulates the participants\n",
    "\n",
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"assignment\": participant_assignment,\n",
    "        \"compliance_type\": compliance_type,\n",
    "        \"response_type\": response_type,\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Simulate whether participants took treatment\n",
    "\n",
    "Depending on assignment and compliance type, we can simulate whether or not each participant took the treatment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# if the participant is an always_taker,\n",
    "# they'll always take the treatment.\n",
    "\n",
    "df[\"took_treatment\"] = (\n",
    "    df.compliance_type == \"always_taker\"\n",
    ")\n",
    "\n",
    "# if they're a complier, they'll take the treatment\n",
    "# as long as they're in the treatment condition.\n",
    "\n",
    "df[\"took_treatment\"] = df[\"took_treatment\"] | (\n",
    "    (df.compliance_type == \"complier\")\n",
    "    & (df.assignment == \"treatment\")\n",
    ")\n",
    "\n",
    "# if they're a defier, they'll only take the treatment\n",
    "# if they were in the control condition.\n",
    "\n",
    "df[\"took_treatment\"] = df[\"took_treatment\"] | (\n",
    "    (df.compliance_type == \"defier\")\n",
    "    & (df.assignment == \"control\")\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Simulate Outcomes\n",
    "\n",
    "Now we can simulate outcomes from the experiment.\n",
    "\n",
    "Depending on whether they took the treatment and their `response_type`, did they end up in a Good or Bad state after the experiment's conclusion?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# if the participant is of the always_better type,\n",
    "# they'll definitely have a good outcome.\n",
    "\n",
    "df[\"good_outcome\"] = (\n",
    "    df.response_type == \"always_better\"\n",
    ")\n",
    "\n",
    "# if the participant is of the 'helped' type,\n",
    "# they'll have a good outcome as long as they\n",
    "# took treatment.\n",
    "\n",
    "df[\"good_outcome\"] = df[\"good_outcome\"] | (\n",
    "    (df.response_type == \"helped\")\n",
    "    & (df.took_treatment)\n",
    ")\n",
    "\n",
    "# if the participant is of the 'hurt' type,\n",
    "# they'll have a good outcome as long as they\n",
    "# did NOT take the treatment!\n",
    "\n",
    "df[\"good_outcome\"] = df[\"good_outcome\"] | (\n",
    "    (df.response_type == \"hurt\")\n",
    "    & (~df.took_treatment)\n",
    ")\n",
    "\n",
    "# Otherwise, the outcome is going to be bad\n",
    "# and the column will have a False value."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can now observe the probabilities of each (Treatment, Outcome) combination that would emerge, conditional on the assignment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"n\"] = 1\n",
    "results = (\n",
    "    df.groupby(\n",
    "        [\n",
    "            \"assignment\",\n",
    "            \"took_treatment\",\n",
    "            \"good_outcome\",\n",
    "        ]\n",
    "    )\n",
    "    .count()\n",
    "    .n\n",
    ")\n",
    "results = results.to_frame()\n",
    "results[\"assignment_n\"] = results.groupby(\n",
    "    \"assignment\"\n",
    ").transform(\"sum\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>P( X, Y | Z )</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>assignment</th>\n",
       "      <th>took_treatment</th>\n",
       "      <th>good_outcome</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">control</th>\n",
       "      <th rowspan=\"2\" valign=\"top\">False</th>\n",
       "      <th>False</th>\n",
       "      <td>0.315285</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>0.043756</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">True</th>\n",
       "      <th>False</th>\n",
       "      <td>0.323676</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>0.317283</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"4\" valign=\"top\">treatment</th>\n",
       "      <th rowspan=\"2\" valign=\"top\">False</th>\n",
       "      <th>False</th>\n",
       "      <td>0.019820</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>0.670671</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">True</th>\n",
       "      <th>False</th>\n",
       "      <td>0.167968</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>True</th>\n",
       "      <td>0.141542</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        P( X, Y | Z )\n",
       "assignment took_treatment good_outcome               \n",
       "control    False          False              0.315285\n",
       "                          True               0.043756\n",
       "           True           False              0.323676\n",
       "                          True               0.317283\n",
       "treatment  False          False              0.019820\n",
       "                          True               0.670671\n",
       "           True           False              0.167968\n",
       "                          True               0.141542"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p_states = results.n / results.assignment_n\n",
    "p_states = p_states.rename(\"P( X, Y | Z )\")\n",
    "\n",
    "# display\n",
    "p_states.to_frame()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"from itertools import product\n",
	"from sklearn.preprocessing import normalize"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Participant simulation\n",
	"\n",
	"First we'll create n participants in our experiment."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"n = 10000"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"For each of these participants, we'll randomly assign them some (Compliance, Response) behavior combination."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"compliance_partitions = [\n",
	" \"always_taker\",\n",
	" \"complier\",\n",
	" \"defier\",\n",
	" \"never_taker\",\n",
	"]\n",
	"response_partitions = [\n",
	" \"always_better\",\n",
	" \"helped\",\n",
	" \"hurt\",\n",
	" \"never_better\",\n",
	"]\n",
	"\n",
	"# Take the cross product of these sets of types\n",
	"\n",
	"partition_types = product(\n",
	" compliance_partitions, response_partitions\n",
	")\n",
	"partition_types = np.array(list(partition_types))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# We'll simulate probabilities that our participants\n",
	"# will belong to one of the 16 possible behavior combinations\n",
	"\n",
	"# I'm setting up a contrived example\n",
	"# to prove my point with these arbitrary floats--\n",
	"# trust me it's instructive!\n",
	"\n",
	"arbitrary_floats = np.array(\n",
	" [\n",
	" [\n",
	" 0,\n",
	" 0,\n",
	" 0,\n",
	" 0.01,\n",
	" 0,\n",
	" 0.14,\n",
	" 0,\n",
	" 0.16,\n",
	" 0.32,\n",
	" 0,\n",
	" 0.31,\n",
	" 0,\n",
	" 0.04,\n",
	" 0.02,\n",
	" 0,\n",
	" 0,\n",
	" ]\n",
	" ]\n",
	")\n",
	"partition_probabilities = normalize(\n",
	" arbitrary_floats, \"l1\"\n",
	")\n",
	"partition_probabilities = (\n",
	" partition_probabilities.flatten()\n",
	")\n",
	"\n",
	"# to be a true set of probabilities, the vector sum\n",
	"# needs to be 1\n",
	"\n",
	"# sometimes this can fail because of precision errors,\n",
	"# so let's assert it\n",
	"\n",
	"assert partition_probabilities.sum() == 1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"# drawing participant compliance and response behaviors according to the\n",
	"# specified distribution\n",
	"\n",
	"participant_partition = np.random.choice(\n",
	" range(len(partition_types)),\n",
	" n,\n",
	" p=partition_probabilities,\n",
	")\n",
	"\n",
	"compliance_response_pairs = zip(\n",
	" *partition_types[participant_partition]\n",
	")\n",
	"compliance_type, response_type = list(\n",
	" compliance_response_pairs\n",
	")\n",
	"\n",
	"# assigning participants to Control and Treatment groups\n",
	"# with 50% probability\n",
	"\n",
	"assignments = [\"control\", \"treatment\"]\n",
	"participant_assignment = np.random.choice(\n",
	" assignments, n\n",
	")\n",
	"\n",
	"# compiling all information into a dataframe\n",
	"# that simulates the participants\n",
	"\n",
	"df = pd.DataFrame(\n",
	" {\n",
	" \"assignment\": participant_assignment,\n",
	" \"compliance_type\": compliance_type,\n",
	" \"response_type\": response_type,\n",
	" }\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Simulate whether participants took treatment\n",
	"\n",
	"Depending on assignment and compliance type, we can simulate whether or not each participant took the treatment."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"# if the participant is an always_taker,\n",
	"# they'll always take the treatment.\n",
	"\n",
	"df[\"took_treatment\"] = (\n",
	" df.compliance_type == \"always_taker\"\n",
	")\n",
	"\n",
	"# if they're a complier, they'll take the treatment\n",
	"# as long as they're in the treatment condition.\n",
	"\n",
	"df[\"took_treatment\"] = df[\"took_treatment\"] \| (\n",
	" (df.compliance_type == \"complier\")\n",
	" & (df.assignment == \"treatment\")\n",
	")\n",
	"\n",
	"# if they're a defier, they'll only take the treatment\n",
	"# if they were in the control condition.\n",
	"\n",
	"df[\"took_treatment\"] = df[\"took_treatment\"] \| (\n",
	" (df.compliance_type == \"defier\")\n",
	" & (df.assignment == \"control\")\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Simulate Outcomes\n",
	"\n",
	"Now we can simulate outcomes from the experiment.\n",
	"\n",
	"Depending on whether they took the treatment and their `response_type`, did they end up in a Good or Bad state after the experiment's conclusion?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"# if the participant is of the always_better type,\n",
	"# they'll definitely have a good outcome.\n",
	"\n",
	"df[\"good_outcome\"] = (\n",
	" df.response_type == \"always_better\"\n",
	")\n",
	"\n",
	"# if the participant is of the 'helped' type,\n",
	"# they'll have a good outcome as long as they\n",
	"# took treatment.\n",
	"\n",
	"df[\"good_outcome\"] = df[\"good_outcome\"] \| (\n",
	" (df.response_type == \"helped\")\n",
	" & (df.took_treatment)\n",
	")\n",
	"\n",
	"# if the participant is of the 'hurt' type,\n",
	"# they'll have a good outcome as long as they\n",
	"# did NOT take the treatment!\n",
	"\n",
	"df[\"good_outcome\"] = df[\"good_outcome\"] \| (\n",
	" (df.response_type == \"hurt\")\n",
	" & (~df.took_treatment)\n",
	")\n",
	"\n",
	"# Otherwise, the outcome is going to be bad\n",
	"# and the column will have a False value."
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We can now observe the probabilities of each (Treatment, Outcome) combination that would emerge, conditional on the assignment."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"df[\"n\"] = 1\n",
	"results = (\n",
	" df.groupby(\n",
	" [\n",
	" \"assignment\",\n",
	" \"took_treatment\",\n",
	" \"good_outcome\",\n",
	" ]\n",
	" )\n",
	" .count()\n",
	" .n\n",
	")\n",
	"results = results.to_frame()\n",
	"results[\"assignment_n\"] = results.groupby(\n",
	" \"assignment\"\n",
	").transform(\"sum\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th>P( X, Y \| Z )</th>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>assignment</th>\n",
	" <th>took_treatment</th>\n",
	" <th>good_outcome</th>\n",
	" <th></th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th rowspan=\"4\" valign=\"top\">control</th>\n",
	" <th rowspan=\"2\" valign=\"top\">False</th>\n",
	" <th>False</th>\n",
	" <td>0.315285</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>True</th>\n",
	" <td>0.043756</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th rowspan=\"2\" valign=\"top\">True</th>\n",
	" <th>False</th>\n",
	" <td>0.323676</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>True</th>\n",
	" <td>0.317283</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th rowspan=\"4\" valign=\"top\">treatment</th>\n",
	" <th rowspan=\"2\" valign=\"top\">False</th>\n",
	" <th>False</th>\n",
	" <td>0.019820</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>True</th>\n",
	" <td>0.670671</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th rowspan=\"2\" valign=\"top\">True</th>\n",
	" <th>False</th>\n",
	" <td>0.167968</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>True</th>\n",
	" <td>0.141542</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" P( X, Y \| Z )\n",
	"assignment took_treatment good_outcome \n",
	"control False False 0.315285\n",
	" True 0.043756\n",
	" True False 0.323676\n",
	" True 0.317283\n",
	"treatment False False 0.019820\n",
	" True 0.670671\n",
	" True False 0.167968\n",
	" True 0.141542"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"p_states = results.n / results.assignment_n\n",
	"p_states = p_states.rename(\"P( X, Y \| Z )\")\n",
	"\n",
	"# display\n",
	"p_states.to_frame()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}