Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Comparing cov_exp_quad to alternative gp optimizations
#load packages
library(tidyverse)
library(rstan)
rstan_options(auto_write = TRUE)
# Make some fake data ----
#set random seed for reproducibility
set.seed(1)
#nX: number of unique samples on x-axis
nX = 100
#define the x-axis sample points
# on a grid:
x = seq(-5,5,length.out=nX)
# not on a grid (udx will not perform well in this case):
#x = sort(runif(nX,-5,5))
#define a wiggly function on x
f = sin(x)*dnorm(x,5,8)
plot(x,f)
#add noise
y = rnorm(nX,f,.01)
#plot
p = ggplot()+
geom_point(
data = data.frame(x,y)
, mapping = aes(
x = x
, y = y
)
, alpha = .5
)
print(p)
#Prep the data ----
#scale for default prior specification
x_scaled = (x-min(x))/max(x-min(x)) #0-1
y_scaled = scale(y)[,1] #mean0,sd1
#compute unique pairwise x differences & indices thereof for the pairwise matrix
x_unique = sort(unique(x_scaled))
nX = length(x_unique)
dx = matrix(NA,nX,nX)
for(i in 1:nX){
for(j in 1:nX){
dx[i,j] = round(-((x_unique[i]-x_unique[j])^2),10)
}
}
dx_unique = rev(sort(unique(as.vector(dx))))
DXind = matrix(NA,nX,nX)
for(i in 1:nX){
for(j in 1:nX){
DXind[i,j] = which(dx_unique==dx[i,j])
}
}
#check that we computed the above correctly
temp = matrix(NA,nX,nX)
for(i in 1:nX){
for(j in 1:nX){
temp[i,j] = dx[i,j]==dx_unique[DXind[i,j]]
}
}
all(temp)
#Sample the model that uses cov_exp_quad ----
post_ceq = rstan::stan(
file = "gp_ceq.stan"
, data=list(
nX = length(x)
, X = x_scaled
, nY = length(y)
, Y = y_scaled
, Xind = 1:nX
)
, iter = 2e2 #2e2 takes about 1m when nX=100
, chains = parallel::detectCores()
, cores = parallel::detectCores()
, seed = 1
)
alarm()
summary(rowSums(get_elapsed_time(post_ceq))/60)
#look at summary for the GP parameters ----
print(
post_ceq
, pars = c('eta','rho','noise')
, probs = c(.025,.5,.975)
, digits = 2
, use_cache = FALSE
)
#plot the CEQ model's results ----
#get posterior samples on latent function f
f = extract(
post_ceq
, pars = 'f'
)[[1]]
#revert to original scale
f = f*sd(y) + mean(y)
#compute median & 95% credible intervals
f_med = apply(f,2,median)
f_loci = apply(f,2,quantile,prob=.025)
f_hici = apply(f,2,quantile,prob=.975)
#add to plot
p = p + geom_ribbon(
data = data.frame(
x = x
, ymin = f_loci
, ymax = f_hici
)
, mapping = aes(
x = x
, ymin = ymin
, ymax = ymax
)
, alpha = .5
, fill = 'green'
)
p = p + geom_line(
data = data.frame(
x = x
, y = f_med
)
, mapping = aes(
x = x
, y = y
)
, alpha = .5
, colour = 'green'
)
print(p)
#Sample the model that uses pre-computed distances ----
post_dx = rstan::stan(
file = "gp_dx.stan"
, data=list(
nX = length(x)
, DX = dx
, nY = length(y)
, Y = y_scaled
, Xind = 1:nX
)
, iter = 2e2 #2e2 takes about 1.5m when nX=100
, chains = parallel::detectCores()
, cores = parallel::detectCores()
, seed = 1
)
alarm()
summary(rowSums(get_elapsed_time(post_dx))/60)
#slower than ceq, so ceq must have some optimizations internal to cov_exp_quad
#look at summary for the GP parameters ----
print(
post_dx
, pars = c('eta','rho','noise')
, probs = c(.025,.5,.975)
, digits = 2
, use_cache = FALSE
)
#plot the DX model's results ----
#get posterior samples on latent function f
f = extract(
post_dx
, pars = 'f'
)[[1]]
#revert to original scale
f = f*sd(y) + mean(y)
#compute median & 95% credible intervals
f_med = apply(f,2,median)
f_loci = apply(f,2,quantile,prob=.025)
f_hici = apply(f,2,quantile,prob=.975)
#add to plot
p = p + geom_ribbon(
data = data.frame(
x = x
, ymin = f_loci
, ymax = f_hici
)
, mapping = aes(
x = x
, ymin = ymin
, ymax = ymax
)
, alpha = .5
, fill = 'blue'
)
p = p + geom_line(
data = data.frame(
x = x
, y = f_med
)
, mapping = aes(
x = x
, y = y
)
, alpha = .5
, colour = 'blue'
)
print(p)
#looks good!
#sample the model that uses the precomputed unique distances----
post_udx = rstan::stan(
file = "gp_udx.stan"
, data=list(
nDX = length(dx_unique)
, dx_unique = dx_unique
, nX = length(x_unique)
, DXind = DXind
, nY = length(y)
, Y = y_scaled
, Xind = match(x,sort(unique(x)))
)
, iter = 2e2 #2e2 takes about 1m when nX=100
, chains = parallel::detectCores()
, cores = parallel::detectCores()
, seed = 1
)
alarm()
summary(rowSums(get_elapsed_time(post_udx))/60)
#about as fast ceq (indeed, ~30% faster with 2e3 chains)
#look at summary for the GP parameters ----
print(
post_udx
, pars = c('eta','rho','noise')
, probs = c(.025,.5,.975)
, digits = 2
)
#about the same as the CEQ model's result
#plot the UDX model results ----
#get posterior samples on latent function f
f = extract(
post_udx
, pars = 'f'
)[[1]]
#revert to original scale
f = f*sd(y) + mean(y)
#compute median & 95% credible intervals
f_med = apply(f,2,median)
f_loci = apply(f,2,quantile,prob=.025)
f_hici = apply(f,2,quantile,prob=.975)
#add to plot
p = p + geom_ribbon(
data = data.frame(
x = x
, ymin = f_loci
, ymax = f_hici
)
, mapping = aes(
x = x
, ymin = ymin
, ymax = ymax
)
, alpha = .5
, fill = 'red'
)
p = p + geom_line(
data = data.frame(
x = x
, y = f_med
)
, mapping = aes(
x = x
, y = y
)
, alpha = .5
, colour = 'red'
)
print(p)
#about the same as the CEQ model's result
data {
# nX: num unique x values
int<lower=1> nX ;
# nY: num observations of Y
int<lower=1> nY ;
# Y: outcomes
vector[nY] Y ;
# X: predictors
real X[nX] ;
# Xind: index of x value for each outcome
int Xind[nY] ;
}
parameters {
# eta: GP parameter, scale of latent function f
real<lower=0> eta ;
# noise: scale of measurement noise of y given f
real<lower=0> noise ;
# z: dummy variable to speed up computation of GP
# (See "Cholesky Factored and Transformed Implementation" from manual)
vector[nX] z ;
# q: dummy variable used to imply cauchy(0,1) prior on rho
# (See "Reparameterizing the Cauchy" from manual)
real<lower=0,upper=pi()/2> q ;
}
transformed parameters{
# f: latent function values
vector[nX] f ;
# rho: GP parameter, x-axis scale of influence between points (wiggliness)
# a.k.a. inverse-lengthscale
real<lower=0> rho ;
rho = tan(q) ; #implies rho ~ cauchy(0,1), peaked @ zero with heavy tail
{ #local environment to avoid saving unnecessary variables across iterations
# L: cholesky decomposition of covariance matrix
matrix[nX,nX] L ;
# covMat: covariance matrix
matrix[nX,nX] covMat ;
real eta_sq ;
eta_sq = eta^2 ;
covMat = cov_exp_quad(X, eta, 1/rho) ;
for(i in 1:nX){
covMat[i,i] = eta_sq + .001 ; #jitter diagonal for positive definiteness
}
L = cholesky_decompose(covMat) ;
f = L * z ;
}
}
model {
# rho has an implicit cauchy(0,1) prior
# prior on eta
eta ~ weibull(2,1) ; #peaked around .8
# prior on noise
noise ~ weibull(2,1) ; #peaked around .8
# z as standard normal
z ~ normal(0,1) ;
# y as f plus gaussian noise
Y ~ normal(f[Xind],noise) ;
}
data {
# nX: num unique x values
int<lower=1> nX ;
# DX: matrix of pairwise x distances
matrix[nX,nX] DX ;
# nY: num observations of Y
int<lower=1> nY ;
# Y: outcomes
vector[nY] Y ;
# Xind: index of x value for each outcome
int Xind[nY] ;
}
parameters {
# eta: GP parameter, scale of latent function f
real<lower=0> eta ;
# noise: scale of measurement noise of y given f
real<lower=0> noise ;
# z: dummy variable to speed up computation of GP
# (See "Cholesky Factored and Transformed Implementation" from manual)
vector[nX] z ;
# q: dummy variable used to imply cauchy(0,1) prior on rho
# (See "Reparameterizing the Cauchy" from manual)
real<lower=0,upper=pi()/2> q ;
}
transformed parameters{
# f: latent function values
vector[nX] f ;
# rho: GP parameter, x-axis scale of influence between points (wiggliness)
# a.k.a. inverse-lengthscale
real<lower=0> rho ;
rho = tan(q) ; #implies rho ~ cauchy(0,1), peaked @ zero with heavy tail
{ #local environment to avoid saving unnecessary variables across iterations
# L: cholesky decomposition of covariance matrix
matrix[nX,nX] L ;
# covMat: covariance matrix
matrix[nX,nX] covMat ;
real rho_sq ;
real eta_sq ;
rho_sq = rho^2 ;
eta_sq = eta^2 ;
#could do the following:
#covMat = exp(DX*rho_sq)*eta_sq ;
#then loop to add jitter to the diagonal,
#but it's faster to loop & copy:
for(i in 1:(nX-1)){
covMat[i,i] = eta_sq + .001 ; #jitter diagonal for positive definiteness
for(j in (i+1):nX){
covMat[i,j] = exp(DX[i,j]*rho_sq)*eta_sq ;
covMat[j,i] = covMat[i,j] ;
}
}
covMat[nX,nX] = eta_sq + .001 ; #final diagonal entry
L = cholesky_decompose(covMat) ;
f = L * z ;
}
}
model {
# rho has an implicit cauchy(0,1) prior
# prior on eta
eta ~ weibull(2,1) ; #peaked around .8
# prior on noise
noise ~ weibull(2,1) ; #peaked around .8
# z as standard normal
z ~ normal(0,1) ;
# y as f plus gaussian noise
Y ~ normal(f[Xind],noise) ;
}
data {
# nDX: num unique x values
int<lower=1> nDX ;
# dx: unique dx values
vector[nDX] dx_unique ;
# nX: num unique x values
int<lower=1> nX ;
# DXind: matrix of indices into dx_unique
int DXind[nX,nX] ;
# nY: num observations of Y
int<lower=1> nY ;
# Y: outcomes
vector[nY] Y ;
# Xind: index of x value for each outcome
int Xind[nY] ;
}
parameters {
# eta: GP parameter, scale of latent function f
real<lower=0> eta ;
# noise: scale of measurement noise of y given f
real<lower=0> noise ;
# z: dummy variable to speed up computation of GP
# (See "Cholesky Factored and Transformed Implementation" from manual)
vector[nX] z ;
# q: dummy variable used to imply cauchy(0,1) prior on rho
# (See "Reparameterizing the Cauchy" from manual)
real<lower=0,upper=pi()/2> q ;
}
transformed parameters{
# f: latent function values
vector[nX] f ;
# rho: GP parameter, x-axis scale of influence between points (wiggliness)
# a.k.a. inverse-lengthscale
real<lower=0> rho ;
rho = tan(q) ; #implies rho ~ cauchy(0,1), peaked @ zero with heavy tail
{ #local environment to avoid saving unnecessary variables across iterations
# L: cholesky decomposition of covariance matrix
matrix[nX,nX] L ;
# covMat: covariance matrix
matrix[nX,nX] covMat ;
# covars: unique entries in covariance matrix
vector[nDX] covars ;
covars = exp(dx_unique*(rho^2))*(eta^2) ;
covars[1] = (eta^2) + .001 ; #jitter diagonal for positive definiteness
for(i in 1:(nX-1)){
covMat[i,i] = covars[1] ;
for(j in (i+1):nX){
covMat[i,j] = covars[DXind[i,j]] ;
covMat[j,i] = covMat[i,j] ;
}
}
covMat[nX,nX] = covars[1] ; #final diagonal entry
L = cholesky_decompose(covMat) ;
f = L * z ;
}
}
model {
# rho has an implicit cauchy(0,1) prior
# prior on eta
eta ~ weibull(2,1) ; #peaked around .8
# prior on noise
noise ~ weibull(2,1) ; #peaked around .8
# z as standard normal
z ~ normal(0,1) ;
# y as f plus noise
Y ~ normal(f[Xind],noise) ;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment