Normal Regression with Horseshoe-Prior Gibbs Sampling for an Under-determined Model
import time
from datetime import date, timedelta
import arviz as az
import formulaic
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.linalg
import scipy.sparse
import scipy.stats
# from polyagamma import random_polyagamma
X_formula = formulaic.Formula(
    "obs ~ 1 + C(month) * C(weekday) * C(hour) * C(factor_1) * C(factor_2) * C(factor_3) * C(factor_4)"
)
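# Rough sketch (not part of the original gist) of why this design is
# under-determined: the fully crossed categorical formula above expands to
# roughly the product of the level counts, while ~100 days of hourly data
# provides only ~2400 rows, so p >> n.
assumed_levels = {"month": 12, "weekday": 7, "hour": 24, "factor_1": 4, "factor_2": 4}
approx_n_columns = int(np.prod(list(assumed_levels.values())))  # 32256 columns vs. ~2400 rows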
def simulate_data(days_ago=100, alpha=100):
    """Simulate hourly observations from a sparse set of regression coefficients."""
    as_of_date = date(2021, 2, 11)
    start_date, end_date = as_of_date + timedelta(days=-days_ago), as_of_date
    date_rng = pd.date_range(start=start_date, end=end_date, freq="H")
    N = len(date_rng)

    factor_1 = np.random.randint(0, 4, size=N)
    factor_2 = np.random.randint(0, 4, size=N)
    factor_3 = np.random.randint(0, 1, size=N)
    factor_4 = np.random.randint(0, 1, size=N)
    factor_5 = np.random.randint(0, 1, size=N)

    data = pd.DataFrame({"obs": 0}, index=date_rng)
    data = data.assign(
        month=pd.Categorical(data.index.month, categories=list(range(1, 13))),
        weekday=pd.Categorical(data.index.weekday, categories=list(range(7))),
        hour=pd.Categorical(data.index.hour, categories=list(range(24))),
        factor_1=factor_1,
        factor_2=factor_2,
        factor_3=factor_3,
        factor_4=factor_4,
        factor_5=factor_5,
    )

    _, X_csr = X_formula.get_model_matrix(data, output="sparse")

    # Only the first 1% of the coefficients are non-zero
    beta = np.zeros(X_csr.shape[-1])
    beta[0 : (X_csr.shape[1] // 100)] = np.random.normal(
        np.exp(np.linspace(np.log(5), 0, num=X_csr.shape[1] // 100)), 1
    )

    eta = np.asarray(X_csr.dot(beta))

    # Binomial regression with N ~ Poisson
    # z = scipy.special.expit(eta)
    # q = np.random.beta(alpha * eta, alpha * (1 - eta))
    # obs = np.random.poisson(population_size * q)

    # Negative binomial regression
    # mu = np.exp(eta)
    # p = alpha / (mu + alpha)
    # obs = np.random.negative_binomial(alpha, p)

    obs = np.random.normal(eta, 1)

    data["obs"] = obs

    return data, beta
np.random.seed(4223)
sim_data, true_beta = simulate_data()
def large_p_mvnormal_sampler(D_diag, Phi, a):
    r"""Efficiently sample from a large multivariate normal.

    This function draws samples from the following distribution:

    .. math::
        \beta \sim \operatorname{N}\left( \mu, \Sigma \right)

    where

    .. math::
        \mu = \Sigma \Phi^\top a, \\
        \Sigma = \left( \Phi^\top \Phi + D^{-1} \right)^{-1}

    and :math:`a \in \mathbb{R}^{n}`, :math:`\Phi \in \mathbb{R}^{n \times p}`.

    This approach is particularly effective when :math:`p \gg n`.

    From "Fast sampling with Gaussian scale-mixture priors in high-dimensional
    regression", Bhattacharya, Chakraborty, and Mallick, 2015.

    """
    N = a.shape[0]
    u = np.random.normal(0, np.sqrt(D_diag))
    delta = np.random.normal(size=N)
    v = Phi * u + delta
    Phi_D = Phi.multiply(D_diag)
    Z = (Phi_D * Phi.T + scipy.sparse.eye(N)).toarray()
    w = scipy.linalg.solve(Z, a - v, assume_a="sym")
    beta = u + Phi_D.T * w
    return beta
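# Hedged sanity check (not part of the original gist): on a small dense
# problem, the average of many draws from `large_p_mvnormal_sampler` should
# approach the closed-form mean `mu = Sigma @ Phi.T @ a`, where
# `Sigma = inv(Phi.T @ Phi + inv(D))`.
def check_large_p_sampler(n=4, p=6, n_draws=20000, seed=123):
    """Return the max absolute error between the Monte Carlo mean and `mu`."""
    rng = np.random.RandomState(seed)
    Phi = scipy.sparse.csr_matrix(rng.normal(size=(n, p)))
    D_diag = rng.uniform(0.5, 2.0, size=p)
    a = rng.normal(size=n)
    Phi_dense = Phi.toarray()
    Sigma = np.linalg.inv(Phi_dense.T @ Phi_dense + np.diag(1.0 / D_diag))
    mu = Sigma @ Phi_dense.T @ a
    draws = np.stack(
        [large_p_mvnormal_sampler(D_diag, Phi, a) for _ in range(n_draws)]
    )
    # The error should shrink toward zero as `n_draws` grows
    return np.max(np.abs(draws.mean(0) - mu))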
def hs_gibbs(data, num_samp=5000, alpha=100, tau2=1.0):
    """Generate posterior samples for a regression with an HS prior."""
    y_csr, X_csr = X_formula.get_model_matrix(data, output="sparse")
    X_csc = X_csr.tocsc()
    N, M = X_csc.shape
    y = y_csr.toarray().squeeze()

    lambdas = np.full(M, 1.0)
    etas = np.full(M, 1.0)
    beta = np.zeros(M)
    omega = np.ones_like(y)

    # The augmented regression is `y ~ N(X @ beta, sigma2)` where
    # `sigma2 = diag(1/omega)`
    sigma2 = 1.0
    sigma = np.sqrt(sigma2)
    y_aug = y

    trace = {"beta": [], "lambda": []}

    for i in range(num_samp):
        if i % 10 == 0:
            print(f"i = {i}")

        # TODO: Re-enable this for Polya-gamma negative-binomial models
        # random_polyagamma(alpha + y, X_csc * beta - np.log(alpha), out=omega)
        # y_aug = np.log(alpha) + (y - alpha) / (2.0 * omega)

        #
        # Efficient under-determined sampling steps...
        #
        # D_inv_diag = t2 / tau2
        V_diag_inv = np.abs(omega)
        Phi = X_csc.T.multiply(np.sqrt(V_diag_inv)).T
        # Phi.nnz / np.prod(Phi.shape) == 0.003938556107574225
        D_diag = tau2 * np.power(lambdas, 2)
        beta = sigma * large_p_mvnormal_sampler(D_diag, Phi, y_aug / sigma)

        #
        # Use slice sampling on `eta = 1 / lambda**2`
        #
        etas = 1.0 / np.power(lambdas, 2)
        u_eta = np.random.uniform(0, 1.0 / (1 + etas))
        exp_rate = np.power(beta, 2) / (2 * sigma2 * tau2)
        slice_upper_lim = (1 - u_eta) / u_eta
        # SciPy's truncated sampler needs an upper limit parameter that's
        # manually scaled by its `scale` parameter: e.g.
        # scipy.stats.truncexpon.rvs(upper_lim / scale, scale=scale)
        etas = scipy.stats.truncexpon.rvs(
            slice_upper_lim * exp_rate, scale=1 / exp_rate
        )
        lambdas = 1 / np.sqrt(etas)

        trace["beta"].append(beta)
        trace["lambda"].append(lambdas)

    return trace
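# Hedged illustration (not part of the original gist) of the `truncexpon`
# parameterization used above: to draw from an Exp(rate) distribution
# truncated to (0, upper), SciPy expects the standardized upper limit
# `b = upper / scale` with `scale = 1 / rate`.
def check_truncexpon(rate=2.0, upper=1.5, size=10000, seed=123):
    """Every draw should land inside (0, upper)."""
    draws = scipy.stats.truncexpon.rvs(
        upper * rate, scale=1 / rate, size=size, random_state=seed
    )
    return 0 <= draws.min() and draws.max() <= upper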
with np.errstate(over="raise"):
    np.random.seed(90923)

    start_time = time.perf_counter()
    sample_trace = hs_gibbs(sim_data, 100, tau2=1)
    end_time = time.perf_counter()

    time_diff = end_time - start_time
    sample_rate = len(sample_trace["beta"]) / time_diff
    print(sample_rate)
beta_samples = np.asarray(sample_trace["beta"])
beta_residuals = beta_samples - true_beta
beta_mean = beta_samples.mean(0)
print((beta_mean - true_beta).mean())
# 0.5483206279599594
# %matplotlib qt
aehmc_trace = az.from_dict(
    posterior={k: np.asarray([v]) for k, v in sample_trace.items()}
)
# ax = az.plot_trace(aehmc_trace)
beta_subset = aehmc_trace.posterior["beta"].to_dataframe()
# Let's look at the terms that are known to be non-zero...
plot_slice = slice(0, 100)
ax = beta_subset.loc[(0, slice(None), plot_slice)].boxplot(
    by="beta_dim_0", column="beta", rot=90
)

plt.scatter(
    np.arange(plot_slice.stop - plot_slice.start) + 1,
    true_beta[plot_slice],
    color="red",
    label="true_beta",
)
# plt.yscale("symlog")
plt.legend()
plt.tight_layout()
# Let's look at the terms that are known to be zero...
plot_slice = slice(true_beta.shape[0] - 200, true_beta.shape[0])
ax = beta_subset.loc[(0, slice(None), plot_slice)].boxplot(
    by="beta_dim_0", column="beta", rot=90
)

plt.scatter(
    np.arange(plot_slice.stop - plot_slice.start) + 1,
    true_beta[plot_slice],
    color="red",
    label="true_beta",
)
plt.yscale("symlog")
plt.legend()
plt.tight_layout()
# TODO: 10675 has way too much variance; is that a numerical issue?
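# A hedged follow-up to the TODO above (not part of the original gist):
# rank the coefficients by posterior standard deviation and check whether
# the high-variance ones correspond to rarely observed design-matrix
# columns, which would suggest weak identification rather than a numerical
# problem.
beta_post_sd = beta_samples.std(0)
high_var_idx = np.argsort(beta_post_sd)[::-1][:10]
print(high_var_idx, beta_post_sd[high_var_idx])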