CerebralMastication/multivariate random cholesky.R

## multivariate random cholesky.R
# multivariate normal example using Cholesky decomposition

require(Matrix)

#make this reproducible
set.seed(2)

#how many draws in our starting data set?
n <- 1e4

# how many draws do we want from this distribution?
drawCount <- 1e4

myData <- rnorm(n, 5, .6)
yourData <- myData + rnorm(n, 8, .25)
hisData <- myData + rnorm(n, 6, .4)
herData <- hisData + rnorm(n, 8, .35)

ourData <- data.frame(myData, yourData, hisData, herData)

# now we have raw correlations in the 70s, 80s, and 90s. Funny how that
# works out
cor(ourData)

ourData0 <- ourData

# shift the mean of ourData to zero
ourData0 <- as.data.frame(sweep(ourData,2,colMeans(ourData),"-"))

#Cholesky Decomposition of the covariance matrix
C <- chol(nearPD(cov(ourData0))$mat)

#create a matrix of random standard normals
Z <- matrix(rnorm(n * ncol(ourData)), ncol(ourData))

#multiply the standard normals by the transpose of the Cholesky
X <- t(C) %*% Z

#look at the covariances
cov(as.matrix(t(X)))
cov(ourData0)
#woot!

myDraws <- data.frame(as.matrix(t(X)))
names(myDraws) <- names(ourData)

# this method handles the covariance (correlation and standard deviation)
# but we still need to shift the means of the samples.

# shift the mean of the draws over to match the starting data
myDraws <- as.data.frame(sweep(myDraws,2,colMeans(ourData),"+"))


#check the mean and sd
apply(myDraws, 2, mean)
apply(myDraws, 2, sd)

#let's look at the mean and standard dev of the starting data
apply(ourData, 2, mean)
apply(ourData, 2, sd)

# so myDraws contains the final draws
# let's check Kolmogorov-Smirnov between the starting data
# and the final draws

for (i in 1:ncol(ourData)){
   print(ks.test(myDraws[[i]], ourData[[i]]))
}

#look at the correlation matrices
cor(myDraws)
cor(ourData)

#it's fun to plot the variables and see if the PDFs line up
#It's a good sanity check. Using ggplot2 to plot

require(ggplot2)

# rearrange the data to be "tall" not "wide"
meltDraws <-melt(myDraws)
meltDraws$dataSource <- "simulated"
meltData <- melt(ourData)
meltData$dataSource <- "original"
plotData <- rbind(meltData, meltDraws)

qplot(value, colour=dataSource, data=plotData, geom="density")+ facet_wrap(~variable)
	# multivariate normal example using Cholesky decomposition

	require(Matrix)

	#make this reproducible
	set.seed(2)

	#how many draws in our starting data set?
	n <- 1e4

	# how many draws do we want from this distribution?
	drawCount <- 1e4

	myData <- rnorm(n, 5, .6)
	yourData <- myData + rnorm(n, 8, .25)
	hisData <- myData + rnorm(n, 6, .4)
	herData <- hisData + rnorm(n, 8, .35)

	ourData <- data.frame(myData, yourData, hisData, herData)

	# now we have raw correlations in the 70s, 80s, and 90s. Funny how that
	# works out
	cor(ourData)

	ourData0 <- ourData

	# shift the mean of ourData to zero
	ourData0 <- as.data.frame(sweep(ourData,2,colMeans(ourData),"-"))

	#Cholesky Decomposition of the covariance matrix
	C <- chol(nearPD(cov(ourData0))$mat)

	#create a matrix of random standard normals
	Z <- matrix(rnorm(n * ncol(ourData)), ncol(ourData))

	#multiply the standard normals by the transpose of the Cholesky
	X <- t(C) %*% Z

	#look at the covariances
	cov(as.matrix(t(X)))
	cov(ourData0)
	#woot!

	myDraws <- data.frame(as.matrix(t(X)))
	names(myDraws) <- names(ourData)

	# this method handles the covariance (correlation and standard deviation)
	# but we still need to shift the means of the samples.

	# shift the mean of the draws over to match the starting data
	myDraws <- as.data.frame(sweep(myDraws,2,colMeans(ourData),"+"))


	#check the mean and sd
	apply(myDraws, 2, mean)
	apply(myDraws, 2, sd)

	#let's look at the mean and standard dev of the starting data
	apply(ourData, 2, mean)
	apply(ourData, 2, sd)

	# so myDraws contains the final draws
	# let's check Kolmogorov-Smirnov between the starting data
	# and the final draws

	for (i in 1:ncol(ourData)){
	print(ks.test(myDraws[[i]], ourData[[i]]))
	}

	#look at the correlation matrices
	cor(myDraws)
	cor(ourData)

	#it's fun to plot the variables and see if the PDFs line up
	#It's a good sanity check. Using ggplot2 to plot

	require(ggplot2)

	# rearrange the data to be "tall" not "wide"
	meltDraws <-melt(myDraws)
	meltDraws$dataSource <- "simulated"
	meltData <- melt(ourData)
	meltData$dataSource <- "original"
	plotData <- rbind(meltData, meltDraws)

	qplot(value, colour=dataSource, data=plotData, geom="density")+ facet_wrap(~variable)