sashagusev/GP_fixed.R

## GP_fixed.R
require(MASS)

## This is work based on Chapter 2 of Rasmussen and Williams, availabler at [ http://www.gaussianprocess.org/ ].

### GP solution and plotting:

gp_solve_basis = function( x.train , y.train , x.pred , h.train, h.pred , kernel , sigma2e = 0 ) {
	# Args:
	#	x.train:	matrix of training data
	#	y.train:	matrix of training labels
	#	x.pred:		matrix of prediction data
	#	h.train:	matrix of fixed-effects/basis functions for training data
	#	h.pred:		matrix of fixed-effects for test data
	#	kernel:		function for computing covariance of matrices
	#	sigma2e:	amount of noise in the covariance

	solution = list()
	# compute training covariance matrix (used to get relationships in training)
	k.xx = kernel(x.train,x.train)
	# compute covariance between training and testing (used to predict weights into new data), these are redundant
	k.x_xp = kernel(x.train,x.pred)
	k.xp_x = kernel(x.pred,x.train)
	# compute covariance between testing (used to estimate covariance of predictions in new data)
	k.xp_xp = kernel(x.pred,x.pred)

	# invert the variance component
	Vinv = solve(k.xx + sigma2e * diag(1, ncol(k.xx)))
	# define b and Binv, the priors on the fixed effects
	# invert the fixed effect matrix
	b = matrix(0,ncol=1,nrow=ncol(h.train))
	B.var = diag(ncol(h.train))
	Binv = solve(B.var)

	# compute the estimate and variance for the prediction including fixed basis [eq 2.41]
	# old mean: solution[["mu"]] = k.xp_x %*% Vinv %*% y.train
	# old var:  solution[["var"]] = k.xp_xp - k.xp_x %*% Vinv %*% k.x_xp

	## we reuse the old (without fixed effects) variance term:
	no_fe.var = k.xp_xp - k.xp_x %*% Vinv %*% k.x_xp
	## we reuse the old (without fixed effects) mean term:
	no_fe.mu = k.xp_x %*% Vinv %*% y.train

	## b.bar: solve for the estimated fixed effects
	# first paren gets the fixed effects on the V residual, and solves with a ridge
	# second paren regresses fixed effects on y on the V residual, and solves with a ridge
	b.bar = solve(Binv + t(h.train) %*% Vinv %*% h.train) %*% (t(h.train) %*% Vinv %*% y.train + Binv %*% b)

	## R : solve for the fixed effects on the V residual
	R = t(h.pred) - t(h.train) %*% Vinv %*% k.x_xp

	## new prediction
	# now includes a b.bar offset and y residualized on b.bar
	# expanded form:
	# solution[["mu"]] = h.train %*% b.bar + K.xp_x %*% Vinv %*% (y.train - h %*% b.bar)
	# compact form:
	# prediction is now the old prediction plus the fixed effects predicted on the V residual
	solution[["mu"]] = no_fe.mu + t(R) %*% b.bar

	## new covariance
	# we take the old variance, and add the uncertainty in the h
	solution[["var"]] = no_fe.var + t(R) %*% solve(Binv + t(h) %*% Vinv %*% h) %*% R
	solution[["bbar"]] = b.bar
	solution[["h"]] = h.pred
	solution[["f_h"]] = h.pred %*% b.bar
	solution[["x"]] = x.pred
	return( solution )
}

gp_plot = function( x , y , GP , n.sample=0 , ... ) {
	# Args:
	#	x:				x-values of training data
	#	y:				y-values of training data
	#	GP:				list holding the GP solution
	#	GP[["x"]]:		the GP test points
	#	GP[["mu"]]:		the GP prediction
	#	GP[["var"]]:	the GP prediction covariance
	#	GP[["h"]]:		the fixed basis points
	#	GP[["f_h"]]:	the predicted fixed basis points
	#	GP[["bbar"]]:	the learned \betas

	# scale around confidence intervals
	# yrng = range( c(y,mu.pred + 2*sqrt(diag(cov.pred)),mu.pred - 2*sqrt(diag(cov.pred))) , na.rm=T )
	# scale around training data (useful for multiple plots of same data)
	# yrng = range( y , na.rm=T ) * 1.05
	# xrng = range( c(x,x.pred) , na.rm=T )

	mu.pred = GP[["mu"]]
	cov.pred = GP[["var"]]
	x.pred = GP[["x"]]

	plot(x,y,type="n",xlab="x",ylab="y" , ... )

	# plot the confidence interval (diagonal of the predicted cov is the variance of each prediction)
	# lines( x.pred , mu.pred + 2*sqrt(diag(cov.pred)) )
	# lines( x.pred , mu.pred - 2*sqrt(diag(cov.pred)) )
	# plot using pretty polygons
	se.pred = 2*sqrt(diag(cov.pred))
	poly.x = c(x.pred[1],x.pred,x.pred[length(x.pred)],rev(x.pred))
	poly.y = c( (mu.pred - se.pred)[1] , mu.pred + se.pred , (mu.pred - se.pred)[length(x.pred)] , rev(mu.pred - se.pred) )
	polygon( poly.x , poly.y , col="#3182bd40" , border="#3182bd50")

	# plot samples from the posterior
	if ( n.sample > 0 ) {
		for (i in 1:n.sample) {
		  samp.y = mvrnorm(1, mu.pred, cov.pred)
		  lines( x.pred , samp.y , col="#666666")
		}
	}

	# plot the prediction
	lines( x.pred , mu.pred , col="#3182bd50",lwd=2)

	# plot the fixed effects (if any)
	if ( !is.null( GP[["f_h"]] ) ) {
		lines( x.pred , GP[["f_h"]] , col="gray", lwd = 2 )
	}
	# plot the observations
	points(x,y,col="black",pch=19,cex=0.75)
	box()
}

### Kernels:
# linear kernel
kfunc_linear = function(x1,x2) {
	return( x1 %*% t(x2) )
}

# linear * linear = quadtratic
kfunc_linear2 = function(x1,x2) {
	return( kfunc_linear(x1,x2)^2 )
}

# exponential kernel
kfunc_exp = function(x1,x2) {
  Sigma <- matrix(rep(0, length(x1)*length(x2)), nrow=length(x1))
  for (i in 1:nrow(Sigma)) {
    for (j in 1:ncol(Sigma)) {
      Sigma[i,j] <- exp(-1*(abs(x1[i]-x2[j])))
    }
  }
  return(Sigma)
}

# browninan kernel
kfunc_brown = function(x1,x2,l=1) {
  Sigma <- matrix(rep(0, length(x1)*length(x2)), nrow=length(x1))
  for (i in 1:nrow(Sigma)) {
    for (j in 1:ncol(Sigma)) {
      Sigma[i,j] <- min(x1[i],x2[j])
    }
  }
  return(Sigma)
}

# matern kernel
kfunc_matern = function(x1,x2,l=1) {
  Sigma <- matrix(rep(0, length(x1)*length(x2)), nrow=length(x1))
  for (i in 1:nrow(Sigma)) {
    for (j in 1:ncol(Sigma)) {
      Sigma[i,j] <- (1 + abs(x1[i]-x2[j])) * exp(-1*abs(x1[i]-x2[j]))
    }
  }
  return(Sigma)
}

# gaussian kernel
kfunc_gauss = function(x1,x2,l=1) {
  Sigma <- matrix(rep(0, length(x1)*length(x2)), nrow=length(x1))
  for (i in 1:nrow(Sigma)) {
    for (j in 1:ncol(Sigma)) {
      Sigma[i,j] <- exp(-0.5*(x1[i]-x2[j])^2)
    }
  }
  return(Sigma)
}

# sin kernel
kfunc_sinc = function(x1,x2,l=1) {
  Sigma <- matrix(rep(0, length(x1)*length(x2)), nrow=length(x1))
  for (i in 1:nrow(Sigma)) {
    for (j in 1:ncol(Sigma)) {
      Sigma[i,j] <- sin(abs(x1[i]-x2[j]))/abs(x1[i]-x2[j])
    }
  }
  diag(Sigma) = 1
  return(Sigma)
}

### Analyses:

## mean shift
x = matrix( rnorm(10,0,2) ,ncol=1)
x = (x - min(x)) * 4 / (max(x)-min(x)) - 2
h = matrix(1,ncol=1,nrow=nrow(x))
y = 10 * h + sin(x)

yrng = c(0,15)
xrng = c(-5,5)
x.test = matrix(seq(-5,5,len=100),nrow=100,ncol=1)
h.test = matrix(1,ncol=1,nrow=nrow(x.test))

svg("plot_gp_mean_est.svg",width=7,height=6)
par(mfrow=c(1,2),cex.axis=0.75,las=1)
gp = gp_solve_basis( x , y , x.test , matrix(0,ncol=1,nrow=nrow(x)) , matrix(0,ncol=1,nrow=nrow(x.test)) , kfunc_gauss , 0.2)
gp_plot( x , y , gp , ylim=yrng , xlim=xrng , main="Zero mean",xaxs="i")

gp = gp_solve_basis( x , y , x.test , h , h.test , kfunc_gauss , 0.2)
gp_plot( x , y , gp , ylim=yrng , xlim=xrng , main="Inferred mean",xaxs="i")
dev.off()

## mixed effects
svg("plot_gp_basis.svg",width=8,height=8)
par(mfrow=c(2,2),cex.axis=0.75,las=1)
yrng = c(-1,30)
xrng = c(-2,2)

# linear function
x = matrix( rnorm(10,0,2) ,ncol=1)
x = (x - min(x)) * 4 / (max(x)-min(x)) - 2
h = cbind(rep(1,nrow(x)),x)

y = h %*% c(10,-5)
plot( x , y , pch=19 , col="black" , xlab="x" , ylab="y" , main="1: Linear effect\nb = -5", ylim=yrng , cex=0.75 , xlim=xrng)

# non-linear noise
u = 2*x^2
plot( x , u , pch=19 , col="black" , xlab="x" , ylab="y" , main="2: Non-linear effect" , ylim=yrng , cex=0.75 , xlim=xrng)

# combine the two
y = y + u
plot( x , y , pch=19 , col="black" , xlab="x" , ylab="y" , main="3: (1) + (2)" , ylim=yrng , cex=0.75 , xlim=xrng)

# define some testing points
x.test = matrix(seq(-4,4,len=100),nrow=100,ncol=1)
h.test = cbind(rep(1,nrow(x.test)),x.test)

gp = gp_solve_basis( x , y , x.test , h , h.test , kfunc_linear2 , 0.1 )
gp_plot( x , y , gp , main=paste("4: GP with inferred basis\nb-est =",round(gp[["bbar"]][2],1)) , ylim=yrng , xlim=xrng)
dev.off()
	require(MASS)

	## This is work based on Chapter 2 of Rasmussen and Williams, availabler at [ http://www.gaussianprocess.org/ ].

	### GP solution and plotting:

	gp_solve_basis = function( x.train , y.train , x.pred , h.train, h.pred , kernel , sigma2e = 0 ) {
	# Args:
	# x.train: matrix of training data
	# y.train: matrix of training labels
	# x.pred: matrix of prediction data
	# h.train: matrix of fixed-effects/basis functions for training data
	# h.pred: matrix of fixed-effects for test data
	# kernel: function for computing covariance of matrices
	# sigma2e: amount of noise in the covariance

	solution = list()
	# compute training covariance matrix (used to get relationships in training)
	k.xx = kernel(x.train,x.train)
	# compute covariance between training and testing (used to predict weights into new data), these are redundant
	k.x_xp = kernel(x.train,x.pred)
	k.xp_x = kernel(x.pred,x.train)
	# compute covariance between testing (used to estimate covariance of predictions in new data)
	k.xp_xp = kernel(x.pred,x.pred)

	# invert the variance component
	Vinv = solve(k.xx + sigma2e * diag(1, ncol(k.xx)))
	# define b and Binv, the priors on the fixed effects
	# invert the fixed effect matrix
	b = matrix(0,ncol=1,nrow=ncol(h.train))
	B.var = diag(ncol(h.train))
	Binv = solve(B.var)

	# compute the estimate and variance for the prediction including fixed basis [eq 2.41]
	# old mean: solution[["mu"]] = k.xp_x %% Vinv %% y.train
	# old var: solution[["var"]] = k.xp_xp - k.xp_x %% Vinv %% k.x_xp

	## we reuse the old (without fixed effects) variance term:
	no_fe.var = k.xp_xp - k.xp_x %% Vinv %% k.x_xp
	## we reuse the old (without fixed effects) mean term:
	no_fe.mu = k.xp_x %% Vinv %% y.train

	## b.bar: solve for the estimated fixed effects
	# first paren gets the fixed effects on the V residual, and solves with a ridge
	# second paren regresses fixed effects on y on the V residual, and solves with a ridge
	b.bar = solve(Binv + t(h.train) %% Vinv %% h.train) %% (t(h.train) %% Vinv %% y.train + Binv %% b)

	## R : solve for the fixed effects on the V residual
	R = t(h.pred) - t(h.train) %% Vinv %% k.x_xp

	## new prediction
	# now includes a b.bar offset and y residualized on b.bar
	# expanded form:
	# solution[["mu"]] = h.train %% b.bar + K.xp_x %% Vinv %% (y.train - h %% b.bar)
	# compact form:
	# prediction is now the old prediction plus the fixed effects predicted on the V residual
	solution[["mu"]] = no_fe.mu + t(R) %*% b.bar

	## new covariance
	# we take the old variance, and add the uncertainty in the h
	solution[["var"]] = no_fe.var + t(R) %% solve(Binv + t(h) %% Vinv %% h) %% R
	solution[["bbar"]] = b.bar
	solution[["h"]] = h.pred
	solution[["f_h"]] = h.pred %*% b.bar
	solution[["x"]] = x.pred
	return( solution )
	}

	gp_plot = function( x , y , GP , n.sample=0 , ... ) {
	# Args:
	# x: x-values of training data
	# y: y-values of training data
	# GP: list holding the GP solution
	# GP[["x"]]: the GP test points
	# GP[["mu"]]: the GP prediction
	# GP[["var"]]: the GP prediction covariance
	# GP[["h"]]: the fixed basis points
	# GP[["f_h"]]: the predicted fixed basis points
	# GP[["bbar"]]: the learned \betas

	# scale around confidence intervals
	# yrng = range( c(y,mu.pred + 2sqrt(diag(cov.pred)),mu.pred - 2sqrt(diag(cov.pred))) , na.rm=T )
	# scale around training data (useful for multiple plots of same data)
	# yrng = range( y , na.rm=T ) * 1.05
	# xrng = range( c(x,x.pred) , na.rm=T )

	mu.pred = GP[["mu"]]
	cov.pred = GP[["var"]]
	x.pred = GP[["x"]]

	plot(x,y,type="n",xlab="x",ylab="y" , ... )

	# plot the confidence interval (diagonal of the predicted cov is the variance of each prediction)
	# lines( x.pred , mu.pred + 2*sqrt(diag(cov.pred)) )
	# lines( x.pred , mu.pred - 2*sqrt(diag(cov.pred)) )
	# plot using pretty polygons
	se.pred = 2*sqrt(diag(cov.pred))
	poly.x = c(x.pred[1],x.pred,x.pred[length(x.pred)],rev(x.pred))
	poly.y = c( (mu.pred - se.pred)[1] , mu.pred + se.pred , (mu.pred - se.pred)[length(x.pred)] , rev(mu.pred - se.pred) )
	polygon( poly.x , poly.y , col="#3182bd40" , border="#3182bd50")

	# plot samples from the posterior
	if ( n.sample > 0 ) {
	for (i in 1:n.sample) {
	samp.y = mvrnorm(1, mu.pred, cov.pred)
	lines( x.pred , samp.y , col="#666666")
	}
	}

	# plot the prediction
	lines( x.pred , mu.pred , col="#3182bd50",lwd=2)

	# plot the fixed effects (if any)
	if ( !is.null( GP[["f_h"]] ) ) {
	lines( x.pred , GP[["f_h"]] , col="gray", lwd = 2 )
	}
	# plot the observations
	points(x,y,col="black",pch=19,cex=0.75)
	box()
	}

	### Kernels:
	# linear kernel
	kfunc_linear = function(x1,x2) {
	return( x1 %*% t(x2) )
	}

	# linear * linear = quadtratic
	kfunc_linear2 = function(x1,x2) {
	return( kfunc_linear(x1,x2)^2 )
	}

	# exponential kernel
	kfunc_exp = function(x1,x2) {
	Sigma <- matrix(rep(0, length(x1)*length(x2)), nrow=length(x1))
	for (i in 1:nrow(Sigma)) {
	for (j in 1:ncol(Sigma)) {
	Sigma[i,j] <- exp(-1*(abs(x1[i]-x2[j])))
	}
	}
	return(Sigma)
	}

	# browninan kernel
	kfunc_brown = function(x1,x2,l=1) {
	Sigma <- matrix(rep(0, length(x1)*length(x2)), nrow=length(x1))
	for (i in 1:nrow(Sigma)) {
	for (j in 1:ncol(Sigma)) {
	Sigma[i,j] <- min(x1[i],x2[j])
	}
	}
	return(Sigma)
	}

	# matern kernel
	kfunc_matern = function(x1,x2,l=1) {
	Sigma <- matrix(rep(0, length(x1)*length(x2)), nrow=length(x1))
	for (i in 1:nrow(Sigma)) {
	for (j in 1:ncol(Sigma)) {
	Sigma[i,j] <- (1 + abs(x1[i]-x2[j])) * exp(-1*abs(x1[i]-x2[j]))
	}
	}
	return(Sigma)
	}

	# gaussian kernel
	kfunc_gauss = function(x1,x2,l=1) {
	Sigma <- matrix(rep(0, length(x1)*length(x2)), nrow=length(x1))
	for (i in 1:nrow(Sigma)) {
	for (j in 1:ncol(Sigma)) {
	Sigma[i,j] <- exp(-0.5*(x1[i]-x2[j])^2)
	}
	}
	return(Sigma)
	}

	# sin kernel
	kfunc_sinc = function(x1,x2,l=1) {
	Sigma <- matrix(rep(0, length(x1)*length(x2)), nrow=length(x1))
	for (i in 1:nrow(Sigma)) {
	for (j in 1:ncol(Sigma)) {
	Sigma[i,j] <- sin(abs(x1[i]-x2[j]))/abs(x1[i]-x2[j])
	}
	}
	diag(Sigma) = 1
	return(Sigma)
	}

	### Analyses:

	## mean shift
	x = matrix( rnorm(10,0,2) ,ncol=1)
	x = (x - min(x)) * 4 / (max(x)-min(x)) - 2
	h = matrix(1,ncol=1,nrow=nrow(x))
	y = 10 * h + sin(x)

	yrng = c(0,15)
	xrng = c(-5,5)
	x.test = matrix(seq(-5,5,len=100),nrow=100,ncol=1)
	h.test = matrix(1,ncol=1,nrow=nrow(x.test))

	svg("plot_gp_mean_est.svg",width=7,height=6)
	par(mfrow=c(1,2),cex.axis=0.75,las=1)
	gp = gp_solve_basis( x , y , x.test , matrix(0,ncol=1,nrow=nrow(x)) , matrix(0,ncol=1,nrow=nrow(x.test)) , kfunc_gauss , 0.2)
	gp_plot( x , y , gp , ylim=yrng , xlim=xrng , main="Zero mean",xaxs="i")

	gp = gp_solve_basis( x , y , x.test , h , h.test , kfunc_gauss , 0.2)
	gp_plot( x , y , gp , ylim=yrng , xlim=xrng , main="Inferred mean",xaxs="i")
	dev.off()

	## mixed effects
	svg("plot_gp_basis.svg",width=8,height=8)
	par(mfrow=c(2,2),cex.axis=0.75,las=1)
	yrng = c(-1,30)
	xrng = c(-2,2)

	# linear function
	x = matrix( rnorm(10,0,2) ,ncol=1)
	x = (x - min(x)) * 4 / (max(x)-min(x)) - 2
	h = cbind(rep(1,nrow(x)),x)

	y = h %*% c(10,-5)
	plot( x , y , pch=19 , col="black" , xlab="x" , ylab="y" , main="1: Linear effect\nb = -5", ylim=yrng , cex=0.75 , xlim=xrng)

	# non-linear noise
	u = 2*x^2
	plot( x , u , pch=19 , col="black" , xlab="x" , ylab="y" , main="2: Non-linear effect" , ylim=yrng , cex=0.75 , xlim=xrng)

	# combine the two
	y = y + u
	plot( x , y , pch=19 , col="black" , xlab="x" , ylab="y" , main="3: (1) + (2)" , ylim=yrng , cex=0.75 , xlim=xrng)

	# define some testing points
	x.test = matrix(seq(-4,4,len=100),nrow=100,ncol=1)
	h.test = cbind(rep(1,nrow(x.test)),x.test)

	gp = gp_solve_basis( x , y , x.test , h , h.test , kfunc_linear2 , 0.1 )
	gp_plot( x , y , gp , main=paste("4: GP with inferred basis\nb-est =",round(gp[["bbar"]][2],1)) , ylim=yrng , xlim=xrng)
	dev.off()