aflaxman/2019_01_17c_pymc3_in-sample_fit_stackoverflow.ipynb

## 2019_01_17c_pymc3_in-sample_fit_stackoverflow.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Thu Jan 17 10:04:34 PST 2019\r\n"
     ]
    }
   ],
   "source": [
    "import numpy as np, matplotlib.pyplot as plt, pandas as pd\n",
    "pd.set_option('display.max_rows', 8)\n",
    "!date\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pymc as pm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Problem: how to evaluate in-sample fit of a model in PyMC? https://stackoverflow.com/questions/54169191/how-to-evaluate-a-pymc2-model-with-train-test-data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " [-----------------100%-----------------] 10000 of 10000 complete in 4.6 secAccuracy on train data = 100.0%\n"
     ]
    }
   ],
   "source": [
    "number_of_samples = 10000\n",
    "X = np.random.randn(100, 2)\n",
    "Y = np.tanh(X[:, 0] + X[:, 1])\n",
    "Y = 1. / (1. + np.exp(-(Y + Y)))\n",
    "Y_train = Y > 0.5\n",
    "\n",
    "w11 = pm.Normal('w11', mu=0., tau=1.)\n",
    "w12 = pm.Normal('w12', mu=0., tau=1.)\n",
    "w21 = pm.Normal('w21', mu=0., tau=1.)\n",
    "w22 = pm.Normal('w22', mu=0., tau=1.)\n",
    "w31 = pm.Normal('w31', mu=0., tau=1.)\n",
    "w32 = pm.Normal('w32', mu=0., tau=1.)\n",
    "\n",
    "x1 = X[:, 0]\n",
    "x2 = X[:, 1]\n",
    "\n",
    "x3 = pm.Lambda('x3', lambda w1=w11, w2=w12: np.tanh(w1 * x1 + w2 * x2))\n",
    "x4 = pm.Lambda('x4', lambda w1=w21, w2=w22: np.tanh(w1 * x1 + w2 * x2))\n",
    "\n",
    "@pm.deterministic\n",
    "def sigmoid(x=w31 * x3 + w32 * x4):\n",
    "    return 1. / (1. + np.exp(-x))\n",
    "\n",
    "y = pm.Bernoulli('y', sigmoid, observed=True, value=Y_train)\n",
    "\n",
    "model = pm.Model([w11, w12, w21, w22, w31, w32, y])\n",
    "inference = pm.MCMC(model)\n",
    "\n",
    "inference.sample(number_of_samples)\n",
    "print('Accuracy on train data = {}%'.format((y.value == Y_train).mean() * 100))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Solution: use a \"posterior predictive check\" (https://docs.pymc.io/notebooks/posterior_predictive.html https://stats.stackexchange.com/questions/115157/what-are-posterior-predictive-checks-and-what-makes-them-useful)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " [-----------------100%-----------------] 10000 of 10000 complete in 5.7 sec"
     ]
    }
   ],
   "source": [
    "y_pred = pm.Bernoulli('y', sigmoid)\n",
    "\n",
    "model = pm.Model([w11, w12, w21, w22, w31, w32, y, y_pred])\n",
    "inference = pm.MCMC(model)\n",
    "\n",
    "inference.sample(number_of_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy on train data = 100.0%\n"
     ]
    }
   ],
   "source": [
    "y_pred_samples = y_pred.trace()\n",
    "y_pred_threshold = (y_pred_samples.mean(axis=0) > .5)\n",
    "print('Accuracy on train data = {}%'.format((y_pred_threshold == Y_train).mean() * 100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.9856, 0.9848, 0.4217, 0.9669, 0.017 , 0.0845, 0.9027, 0.0715,\n",
       "       0.0148, 0.1049, 0.9801, 0.3527, 0.0198, 0.9633, 0.0203, 0.9824,\n",
       "       0.0297, 0.9845, 0.9714, 0.0874, 0.0188, 0.014 , 0.9502, 0.0235,\n",
       "       0.9833, 0.0392, 0.0153, 0.0138, 0.1002, 0.9865, 0.9623, 0.2509,\n",
       "       0.4159, 0.6026, 0.9634, 0.9847, 0.0572, 0.9471, 0.9571, 0.9825,\n",
       "       0.0152, 0.9388, 0.9852, 0.0173, 0.9792, 0.0143, 0.9842, 0.0515,\n",
       "       0.9866, 0.0165, 0.0792, 0.0343, 0.044 , 0.8983, 0.9335, 0.9775,\n",
       "       0.0294, 0.0766, 0.986 , 0.0221, 0.0682, 0.0611, 0.8558, 0.0816,\n",
       "       0.0254, 0.9849, 0.415 , 0.0281, 0.984 , 0.7109, 0.9857, 0.1807,\n",
       "       0.0454, 0.6199, 0.9682, 0.0487, 0.0142, 0.9728, 0.016 , 0.0268,\n",
       "       0.0276, 0.0248, 0.0189, 0.9842, 0.9832, 0.753 , 0.9834, 0.985 ,\n",
       "       0.9814, 0.5429, 0.0149, 0.9861, 0.7057, 0.9016, 0.201 , 0.9829,\n",
       "       0.0226, 0.0343, 0.126 , 0.9498])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred_samples.mean(axis=0) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (dismod_env)",
   "language": "python",
   "name": "dismod_env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Thu Jan 17 10:04:34 PST 2019\r\n"
	]
	}
	],
	"source": [
	"import numpy as np, matplotlib.pyplot as plt, pandas as pd\n",
	"pd.set_option('display.max_rows', 8)\n",
	"!date\n",
	"\n",
	"%load_ext autoreload\n",
	"%autoreload 2"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pymc as pm"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Problem: how to evaluate in-sample fit of a model in PyMC? https://stackoverflow.com/questions/54169191/how-to-evaluate-a-pymc2-model-with-train-test-data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" [-----------------100%-----------------] 10000 of 10000 complete in 4.6 secAccuracy on train data = 100.0%\n"
	]
	}
	],
	"source": [
	"number_of_samples = 10000\n",
	"X = np.random.randn(100, 2)\n",
	"Y = np.tanh(X[:, 0] + X[:, 1])\n",
	"Y = 1. / (1. + np.exp(-(Y + Y)))\n",
	"Y_train = Y > 0.5\n",
	"\n",
	"w11 = pm.Normal('w11', mu=0., tau=1.)\n",
	"w12 = pm.Normal('w12', mu=0., tau=1.)\n",
	"w21 = pm.Normal('w21', mu=0., tau=1.)\n",
	"w22 = pm.Normal('w22', mu=0., tau=1.)\n",
	"w31 = pm.Normal('w31', mu=0., tau=1.)\n",
	"w32 = pm.Normal('w32', mu=0., tau=1.)\n",
	"\n",
	"x1 = X[:, 0]\n",
	"x2 = X[:, 1]\n",
	"\n",
	"x3 = pm.Lambda('x3', lambda w1=w11, w2=w12: np.tanh(w1 * x1 + w2 * x2))\n",
	"x4 = pm.Lambda('x4', lambda w1=w21, w2=w22: np.tanh(w1 * x1 + w2 * x2))\n",
	"\n",
	"@pm.deterministic\n",
	"def sigmoid(x=w31 * x3 + w32 * x4):\n",
	" return 1. / (1. + np.exp(-x))\n",
	"\n",
	"y = pm.Bernoulli('y', sigmoid, observed=True, value=Y_train)\n",
	"\n",
	"model = pm.Model([w11, w12, w21, w22, w31, w32, y])\n",
	"inference = pm.MCMC(model)\n",
	"\n",
	"inference.sample(number_of_samples)\n",
	"print('Accuracy on train data = {}%'.format((y.value == Y_train).mean() * 100))\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Solution: use a \"posterior predictive check\" (https://docs.pymc.io/notebooks/posterior_predictive.html https://stats.stackexchange.com/questions/115157/what-are-posterior-predictive-checks-and-what-makes-them-useful)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" [-----------------100%-----------------] 10000 of 10000 complete in 5.7 sec"
	]
	}
	],
	"source": [
	"y_pred = pm.Bernoulli('y', sigmoid)\n",
	"\n",
	"model = pm.Model([w11, w12, w21, w22, w31, w32, y, y_pred])\n",
	"inference = pm.MCMC(model)\n",
	"\n",
	"inference.sample(number_of_samples)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Accuracy on train data = 100.0%\n"
	]
	}
	],
	"source": [
	"y_pred_samples = y_pred.trace()\n",
	"y_pred_threshold = (y_pred_samples.mean(axis=0) > .5)\n",
	"print('Accuracy on train data = {}%'.format((y_pred_threshold == Y_train).mean() * 100))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([0.9856, 0.9848, 0.4217, 0.9669, 0.017 , 0.0845, 0.9027, 0.0715,\n",
	" 0.0148, 0.1049, 0.9801, 0.3527, 0.0198, 0.9633, 0.0203, 0.9824,\n",
	" 0.0297, 0.9845, 0.9714, 0.0874, 0.0188, 0.014 , 0.9502, 0.0235,\n",
	" 0.9833, 0.0392, 0.0153, 0.0138, 0.1002, 0.9865, 0.9623, 0.2509,\n",
	" 0.4159, 0.6026, 0.9634, 0.9847, 0.0572, 0.9471, 0.9571, 0.9825,\n",
	" 0.0152, 0.9388, 0.9852, 0.0173, 0.9792, 0.0143, 0.9842, 0.0515,\n",
	" 0.9866, 0.0165, 0.0792, 0.0343, 0.044 , 0.8983, 0.9335, 0.9775,\n",
	" 0.0294, 0.0766, 0.986 , 0.0221, 0.0682, 0.0611, 0.8558, 0.0816,\n",
	" 0.0254, 0.9849, 0.415 , 0.0281, 0.984 , 0.7109, 0.9857, 0.1807,\n",
	" 0.0454, 0.6199, 0.9682, 0.0487, 0.0142, 0.9728, 0.016 , 0.0268,\n",
	" 0.0276, 0.0248, 0.0189, 0.9842, 0.9832, 0.753 , 0.9834, 0.985 ,\n",
	" 0.9814, 0.5429, 0.0149, 0.9861, 0.7057, 0.9016, 0.201 , 0.9829,\n",
	" 0.0226, 0.0343, 0.126 , 0.9498])"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"y_pred_samples.mean(axis=0) "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python (dismod_env)",
	"language": "python",
	"name": "dismod_env"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}