josef-pkt/beta_regression.ipynb

## beta_regression.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "distinct-brooks",
   "metadata": {},
   "source": [
    "# Beta Regression in statsmodels\n",
    "\n",
    "Statsmodels includes now a model for Beta Regression where both mean and precision can depend on explanatory variables.\n",
    "\n",
    "The core results are verified against R betareg. However, api and post-estimation results are still subject to change and partially missing.\n",
    "\n",
    "Author : Josef Perktold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "single-algorithm",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os.path\n",
    "from statsmodels.genmod import families\n",
    "links = families.links\n",
    "import pandas as pd\n",
    "import patsy\n",
    "from statsmodels.othermod.betareg import BetaModel\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "adjustable-trash",
   "metadata": {},
   "source": [
    "Loading example data from unit test folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "republican-concentrate",
   "metadata": {},
   "outputs": [],
   "source": [
    "import statsmodels.othermod.tests.results as btr\n",
    "dir_data = btr.__path__[0]\n",
    "income = pd.read_csv(os.path.join(dir_data, 'foodexpenditure.csv'))\n",
    "methylation = pd.read_csv(os.path.join(dir_data, 'methylation-test.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "million-match",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>food</th>\n",
       "      <th>income</th>\n",
       "      <th>persons</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15.998</td>\n",
       "      <td>62.476</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16.652</td>\n",
       "      <td>82.304</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>21.741</td>\n",
       "      <td>74.679</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7.431</td>\n",
       "      <td>39.151</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10.481</td>\n",
       "      <td>64.724</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     food  income  persons\n",
       "0  15.998  62.476        1\n",
       "1  16.652  82.304        5\n",
       "2  21.741  74.679        3\n",
       "3   7.431  39.151        3\n",
       "4  10.481  64.724        5"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "income.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "circular-record",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Unnamed: 0.1</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>Basename</th>\n",
       "      <th>ID</th>\n",
       "      <th>id</th>\n",
       "      <th>plate</th>\n",
       "      <th>CpG</th>\n",
       "      <th>methylation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>F</td>\n",
       "      <td>58.231</td>\n",
       "      <td>6042324088_R04C01</td>\n",
       "      <td>age58.231_F</td>\n",
       "      <td>id_0</td>\n",
       "      <td>6042324088</td>\n",
       "      <td>CpG_0</td>\n",
       "      <td>0.815</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>M</td>\n",
       "      <td>52.632</td>\n",
       "      <td>6042324088_R06C01</td>\n",
       "      <td>age52.632_M</td>\n",
       "      <td>id_1</td>\n",
       "      <td>6042324088</td>\n",
       "      <td>CpG_0</td>\n",
       "      <td>0.803</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>M</td>\n",
       "      <td>64.679</td>\n",
       "      <td>6042324088_R01C01</td>\n",
       "      <td>age64.679_M</td>\n",
       "      <td>id_2</td>\n",
       "      <td>6042324088</td>\n",
       "      <td>CpG_0</td>\n",
       "      <td>0.803</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>F</td>\n",
       "      <td>55.299</td>\n",
       "      <td>6042324088_R04C02</td>\n",
       "      <td>age55.299_F</td>\n",
       "      <td>id_3</td>\n",
       "      <td>6042324088</td>\n",
       "      <td>CpG_0</td>\n",
       "      <td>0.808</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>M</td>\n",
       "      <td>56.019</td>\n",
       "      <td>6042324088_R02C01</td>\n",
       "      <td>age56.019_M</td>\n",
       "      <td>id_4</td>\n",
       "      <td>6042324088</td>\n",
       "      <td>CpG_0</td>\n",
       "      <td>0.855</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0  Unnamed: 0.1 gender     age           Basename           ID  \\\n",
       "0           0             0      F  58.231  6042324088_R04C01  age58.231_F   \n",
       "1           1             1      M  52.632  6042324088_R06C01  age52.632_M   \n",
       "2           2             2      M  64.679  6042324088_R01C01  age64.679_M   \n",
       "3           3             3      F  55.299  6042324088_R04C02  age55.299_F   \n",
       "4           4             4      M  56.019  6042324088_R02C01  age56.019_M   \n",
       "\n",
       "     id       plate    CpG  methylation  \n",
       "0  id_0  6042324088  CpG_0        0.815  \n",
       "1  id_1  6042324088  CpG_0        0.803  \n",
       "2  id_2  6042324088  CpG_0        0.803  \n",
       "3  id_3  6042324088  CpG_0        0.808  \n",
       "4  id_4  6042324088  CpG_0        0.855  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "methylation.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "hourly-simple",
   "metadata": {},
   "source": [
    "## Basic Example\n",
    "\n",
    "Beta Regression models are defined by two design matrices, ``exog`` and ``exog_precision`` and two link functions, `link` and `link_precision` for, respectively, mean and precision of the beta distribution.\n",
    "\n",
    "If no design matrix ``exog_precision`` is specified, then precision is assumed to be constant and internally a column array of ones is used. \n",
    "\n",
    "If either link function is not specified, then the corresponding default links, logit-link for mean and log-link for precision, are used.\n",
    "\n",
    "In the first example, we asume that precision is constant and we use the identity link for it. In that case the estimated precision parameter is directly the precision."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "complete-daisy",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                              BetaModel Results                               \n",
      "==============================================================================\n",
      "Dep. Variable:       I(food / income)   Log-Likelihood:                 45.334\n",
      "Model:                      BetaModel   AIC:                            -82.67\n",
      "Method:            Maximum Likelihood   BIC:                            -76.12\n",
      "Date:                Mon, 19 Jul 2021                                         \n",
      "Time:                        13:28:41                                         \n",
      "No. Observations:                  38                                         \n",
      "Df Residuals:                      34                                         \n",
      "Df Model:                           3                                         \n",
      "==============================================================================\n",
      "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "Intercept     -0.6225      0.221     -2.812      0.005      -1.056      -0.189\n",
      "income        -0.0123      0.003     -3.988      0.000      -0.018      -0.006\n",
      "persons        0.1185      0.036      3.315      0.001       0.048       0.189\n",
      "precision     35.6098      8.082      4.406      0.000      19.770      51.450\n",
      "==============================================================================\n"
     ]
    }
   ],
   "source": [
    "model = \"I(food/income) ~ income + persons\"\n",
    "mod_income = BetaModel.from_formula(model, income, link_precision=links.identity())\n",
    "res_income = mod_income.fit()\n",
    "print(res_income.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "surface-liberty",
   "metadata": {},
   "source": [
    "The default cov_type in statsmodels is based on the observed hessian. R betareg uses by default the expected hessian. \n",
    "We can replicate this by specifying `cov_type=\"eim\"` in the call to the fit method."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "cosmetic-receiver",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                              BetaModel Results                               \n",
      "==============================================================================\n",
      "Dep. Variable:       I(food / income)   Log-Likelihood:                 45.334\n",
      "Model:                      BetaModel   AIC:                            -82.67\n",
      "Method:            Maximum Likelihood   BIC:                            -76.12\n",
      "Date:                Mon, 19 Jul 2021                                         \n",
      "Time:                        13:28:41                                         \n",
      "No. Observations:                  38                                         \n",
      "Df Residuals:                      34                                         \n",
      "Df Model:                           3                                         \n",
      "==============================================================================\n",
      "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "Intercept     -0.6225      0.224     -2.781      0.005      -1.061      -0.184\n",
      "income        -0.0123      0.003     -4.052      0.000      -0.018      -0.006\n",
      "persons        0.1185      0.035      3.352      0.001       0.049       0.188\n",
      "precision     35.6098      8.080      4.407      0.000      19.774      51.445\n",
      "==============================================================================\n"
     ]
    }
   ],
   "source": [
    "mod_income = BetaModel.from_formula(model, income, link_precision=links.identity())\n",
    "res_income_eim = mod_income.fit(cov_type=\"eim\")\n",
    "print(res_income_eim.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "protective-uncertainty",
   "metadata": {},
   "source": [
    "## Methylation example\n",
    "\n",
    "In our second example we have an additional explanatory variable ``age`` for the precision.\n",
    "The default log-link will constrain the predicted precision to be positive."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "japanese-calibration",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                              BetaModel Results                               \n",
      "==============================================================================\n",
      "Dep. Variable:            methylation   Log-Likelihood:                 104.15\n",
      "Model:                      BetaModel   AIC:                            -196.3\n",
      "Method:            Maximum Likelihood   BIC:                            -186.8\n",
      "Date:                Mon, 19 Jul 2021                                         \n",
      "Time:                        13:28:42                                         \n",
      "No. Observations:                  36                                         \n",
      "Df Residuals:                      30                                         \n",
      "Df Model:                           5                                         \n",
      "================================================================================\n",
      "                   coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------\n",
      "Intercept        1.4422      0.034     42.404      0.000       1.376       1.509\n",
      "gender[T.M]      0.0699      0.044      1.603      0.109      -0.016       0.155\n",
      "CpG[T.CpG_1]     0.6073      0.048     12.563      0.000       0.513       0.702\n",
      "CpG[T.CpG_2]     0.9735      0.053     18.331      0.000       0.869       1.078\n",
      "precision-1      8.2283      1.791      4.594      0.000       4.718      11.739\n",
      "precision-2     -0.0347      0.033     -1.059      0.289      -0.099       0.030\n",
      "================================================================================\n"
     ]
    }
   ],
   "source": [
    "model = \"methylation ~ gender + CpG\"\n",
    "exog_prec = patsy.dmatrix(\"~ age\", methylation)\n",
    "mod_meth = BetaModel.from_formula(model, methylation, exog_precision=exog_prec)\n",
    "res_meth = mod_meth.fit(cov_type=\"eim\")\n",
    "print(res_meth.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "latin-pressing",
   "metadata": {},
   "source": [
    "The following results are form betareg package in R.\n",
    "We see that the agreement between results is very high. The precision of parameter estimates are limited by convergence tolerance and agreement across optimization methods will only be in the range of 4 to 6 decimals."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "banned-seminar",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "> summary(res_meth)\n",
    "\n",
    "Call:\n",
    "betareg(formula = methylation ~ gender + CpG | age, data = meth)\n",
    "\n",
    "Standardized weighted residuals 2:\n",
    "             Min               1Q           Median               3Q \n",
    "-1.9096447599437 -0.6740105877959  0.0320959147491  0.5210386717119 \n",
    "             Max \n",
    " 2.4699925639116 \n",
    "\n",
    "Coefficients (mean model with logit link):\n",
    "                    Estimate       Std. Error  z value Pr(>|z|)    \n",
    "(Intercept) 1.44224319715775 0.03401208100794 42.40385  < 2e-16 ***\n",
    "genderM     0.06985724271123 0.04359127972714  1.60255  0.10903    \n",
    "CpGCpG_1    0.60734532189829 0.04834249304140 12.56338  < 2e-16 ***\n",
    "CpGCpG_2    0.97354760812543 0.05311042410115 18.33063  < 2e-16 ***\n",
    "\n",
    "Phi coefficients (precision model with log link):\n",
    "                     Estimate        Std. Error  z value   Pr(>|z|)    \n",
    "(Intercept)  8.22828526376512  1.79098056245575  4.59429 4.3422e-06 ***\n",
    "age         -0.03470542961388  0.03276481006405 -1.05923     0.2895    \n",
    "---\n",
    "Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 \n",
    "\n",
    "Type of estimator: ML (maximum likelihood)\n",
    "Log-likelihood: 104.1480284053 on 6 Df\n",
    "Pseudo R-squared: 0.9051949114785\n",
    "Number of iterations: 12 (BFGS) + 3 (Fisher scoring) \n",
    "\n",
    "> AIC(res_meth)\n",
    "[1] -196.2960568106865\n",
    "> BIC(res_meth)\n",
    "[1] -186.7949431799498\n",
    "\"\"\";"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "employed-module",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1.44224318,  0.06985725,  0.60734533,  0.97354761,  8.22828512,\n",
       "       -0.03470543])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.asarray(res_meth.params)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "knowing-fabric",
   "metadata": {},
   "source": [
    "The pseudo r-squared in R betareg is based on the linear predictor. The pseudo rsquared in statsmodels computes the standard Cox-Snell version based on the likelihood ratio. In this example, both versions are close to each other."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "urban-multiple",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9095835974508253"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_meth.prsquared"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "positive-article",
   "metadata": {},
   "source": [
    "The likelihood ratio test between the full model and the constant-only model is a hypothesis test for the importance of explanatory variables. The null model assumes that both mean and precision are constant.\n",
    "\n",
    "In this example the null hypothesis that the distribution parameters are constant is strongly rejected in favor of the full model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "threaded-ladder",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(104.14802840534058,\n",
       " 60.888095894871256,\n",
       " 86.51986502093865,\n",
       " 7.218729538342336e-18)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_meth.llnull, res_meth.llr, res_meth.llr_pvalue"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "accessory-topic",
   "metadata": {},
   "source": [
    "## Formula support\n",
    "\n",
    "Warning: support for formulas for exog_precision is incomplete\n",
    "\n",
    "When creating the model with `from_formula`, then both design matrices can be specified as formulas.\n",
    "However, the predict method of the results instance currently only handles formula transformation for the design matrix for the mean. The design matrix for precision needs to be provided as numpy array."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "tender-lafayette",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                              BetaModel Results                               \n",
      "==============================================================================\n",
      "Dep. Variable:            methylation   Log-Likelihood:                 104.15\n",
      "Model:                      BetaModel   AIC:                            -196.3\n",
      "Method:            Maximum Likelihood   BIC:                            -186.8\n",
      "Date:                Mon, 19 Jul 2021                                         \n",
      "Time:                        13:28:42                                         \n",
      "No. Observations:                  36                                         \n",
      "Df Residuals:                      30                                         \n",
      "Df Model:                           5                                         \n",
      "================================================================================\n",
      "                   coef    std err          z      P>|z|      [0.025      0.975]\n",
      "--------------------------------------------------------------------------------\n",
      "Intercept        1.4422      0.034     42.404      0.000       1.376       1.509\n",
      "gender[T.M]      0.0699      0.044      1.603      0.109      -0.016       0.155\n",
      "CpG[T.CpG_1]     0.6073      0.048     12.563      0.000       0.513       0.702\n",
      "CpG[T.CpG_2]     0.9735      0.053     18.331      0.000       0.869       1.078\n",
      "precision-1      8.2283      1.791      4.594      0.000       4.718      11.739\n",
      "precision-2     -0.0347      0.033     -1.059      0.289      -0.099       0.030\n",
      "================================================================================\n"
     ]
    }
   ],
   "source": [
    "model = \"methylation ~ gender + CpG\"\n",
    "exog_prec = patsy.dmatrix(\"~ age\", methylation)\n",
    "mod_meth = BetaModel.from_formula(model, methylation, exog_precision_formula=\"~ age\")\n",
    "res_meth = mod_meth.fit(cov_type=\"eim\")\n",
    "print(res_meth.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "novel-paragraph",
   "metadata": {},
   "source": [
    "## Prediction"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cross-dividend",
   "metadata": {},
   "source": [
    "To illustrate prediction, we can create a DataFrame with only the first five observations as observation for prediction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "offshore-netherlands",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Unnamed: 0.1</th>\n",
       "      <th>gender</th>\n",
       "      <th>age</th>\n",
       "      <th>Basename</th>\n",
       "      <th>ID</th>\n",
       "      <th>id</th>\n",
       "      <th>plate</th>\n",
       "      <th>CpG</th>\n",
       "      <th>methylation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>F</td>\n",
       "      <td>58.231</td>\n",
       "      <td>6042324088_R04C01</td>\n",
       "      <td>age58.231_F</td>\n",
       "      <td>id_0</td>\n",
       "      <td>6042324088</td>\n",
       "      <td>CpG_0</td>\n",
       "      <td>0.815</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>M</td>\n",
       "      <td>52.632</td>\n",
       "      <td>6042324088_R06C01</td>\n",
       "      <td>age52.632_M</td>\n",
       "      <td>id_1</td>\n",
       "      <td>6042324088</td>\n",
       "      <td>CpG_0</td>\n",
       "      <td>0.803</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>M</td>\n",
       "      <td>64.679</td>\n",
       "      <td>6042324088_R01C01</td>\n",
       "      <td>age64.679_M</td>\n",
       "      <td>id_2</td>\n",
       "      <td>6042324088</td>\n",
       "      <td>CpG_0</td>\n",
       "      <td>0.803</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>F</td>\n",
       "      <td>55.299</td>\n",
       "      <td>6042324088_R04C02</td>\n",
       "      <td>age55.299_F</td>\n",
       "      <td>id_3</td>\n",
       "      <td>6042324088</td>\n",
       "      <td>CpG_0</td>\n",
       "      <td>0.808</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>M</td>\n",
       "      <td>56.019</td>\n",
       "      <td>6042324088_R02C01</td>\n",
       "      <td>age56.019_M</td>\n",
       "      <td>id_4</td>\n",
       "      <td>6042324088</td>\n",
       "      <td>CpG_0</td>\n",
       "      <td>0.855</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0  Unnamed: 0.1 gender     age           Basename           ID  \\\n",
       "0           0             0      F  58.231  6042324088_R04C01  age58.231_F   \n",
       "1           1             1      M  52.632  6042324088_R06C01  age52.632_M   \n",
       "2           2             2      M  64.679  6042324088_R01C01  age64.679_M   \n",
       "3           3             3      F  55.299  6042324088_R04C02  age55.299_F   \n",
       "4           4             4      M  56.019  6042324088_R02C01  age56.019_M   \n",
       "\n",
       "     id       plate    CpG  methylation  \n",
       "0  id_0  6042324088  CpG_0        0.815  \n",
       "1  id_1  6042324088  CpG_0        0.803  \n",
       "2  id_2  6042324088  CpG_0        0.803  \n",
       "3  id_3  6042324088  CpG_0        0.808  \n",
       "4  id_4  6042324088  CpG_0        0.855  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "med5 = methylation.iloc[:5]\n",
    "med5"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "engaged-retrieval",
   "metadata": {},
   "source": [
    "Predict without exog argument returns the prediction for the estimation sample."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "palestinian-begin",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.80880178, 0.81937228, 0.81937228, 0.80880178, 0.81937228])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_meth.predict()[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "thrown-repository",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.808802\n",
       "1    0.819372\n",
       "2    0.819372\n",
       "3    0.808802\n",
       "4    0.819372\n",
       "dtype: float64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_meth.predict(exog=med5, exog_precision=exog_prec[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "neither-belle",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([496.38578064, 602.85041173, 396.85573075, 549.55547233,\n",
       "       535.99338094])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_meth.predict(which=\"precision\")[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "black-program",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    496.385781\n",
       "1    602.850412\n",
       "2    396.855731\n",
       "3    549.555472\n",
       "4    535.993381\n",
       "dtype: float64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_meth.predict(exog=med5, exog_precision=exog_prec[:5], which=\"precision\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "auburn-sunrise",
   "metadata": {},
   "source": [
    "Note that for predicting mean and precision separately, we only need the explanatory variables for the relevant part."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "greenhouse-journal",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.808802\n",
       "1    0.819372\n",
       "2    0.819372\n",
       "3    0.808802\n",
       "4    0.819372\n",
       "dtype: float64"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_meth.predict(exog=med5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "endless-strip",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([496.38578064, 602.85041173, 396.85573075, 549.55547233,\n",
       "       535.99338094])"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res_meth.predict(exog_precision=exog_prec[:5], which=\"precision\")  # not a pandas Series"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cleared-receptor",
   "metadata": {},
   "source": [
    "Similarly to prediction, we can obtain a frozen distribution instance, that converts the predicted mean and precision to the parameterization of the scipy ``beta`` distribution."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "center-valentine",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'args': (array([401.47770341, 493.95891811, 325.17258608, 444.48144474,\n",
       "         439.17812016]),\n",
       "  array([ 94.90807723, 108.89149363,  71.68314466, 105.07402759,\n",
       "          96.81526078])),\n",
       " 'kwds': {},\n",
       " 'dist': <scipy.stats._continuous_distns.beta_gen at 0x94a2d9b7c0>,\n",
       " 'a': 0.0,\n",
       " 'b': 1.0}"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "distr = res_meth.get_distribution(exog=med5, exog_precision=exog_prec[:5])\n",
    "vars(distr)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "successful-vancouver",
   "metadata": {},
   "source": [
    "We can use the distribution instance to check that the mean agrees with our conditional prediction.\n",
    "Furthermore all the methods of scipy distributions like quantile functions are available for the predicted sample."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "furnished-statement",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([0.80880178, 0.81937228, 0.81937228, 0.80880178, 0.81937228]),\n",
       " array([0.00031091, 0.0002451 , 0.000372  , 0.00028088, 0.00027561]))"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m, v = distr.stats()\n",
    "m, v"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "honest-milwaukee",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([0.79713017, 0.8090028 , 0.80665097, 0.79769782, 0.80838826]),\n",
       " array([0.80921673, 0.8197256 , 0.8199091 , 0.80917656, 0.81976969]))"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# quantiles\n",
    "distr.ppf(0.25), distr.ppf(0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "square-ability",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}