StochasticExample.R
# example of how to use a Gaussian copula to create random draws from
# normally distributed data
 
require("reshape")
require("plyr")
require("QRMlib")
require("Matrix")
 
#make this reproducible
set.seed(2)
 
#how many draws in our starting data set?
n <- 1e4
 
# how many draws do we want from this distribution?
drawCount <- 1e4
 
# ourData will be our starting data set, which we'll
# base our model on
myData <- rnorm(n, 5, .6)
yourData <- myData + rnorm(n, 8, .25)
hisData <- myData + rnorm(n, 6, .4)
herData <- hisData + rnorm(n, 8, .35)
 
ourData <- data.frame(myData, yourData, hisData, herData)
 
# now we have raw correlations in the .70s, .80s, and .90s. Funny how that
# works out
cor(ourData)
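
# (aside) a quick pairs() plot of the first 1,000 rows is a nice visual
# check on those correlations (plotting all 10K rows works too, just slower)
pairs(ourData[1:1000, ])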
 
#set up some simple functions for normalizing (Z Score) and
#denormalizing
normalMoments <- function(t) {
  as.list(c(mean=mean(t), sd=sd(t)))
}
 
normalize <- function(x) {
  (x - mean(x)) / sd(x)
}
 
deNormalize <- function(x, sampleMean, sampleSd) {
  (x * sampleSd + sampleMean)
}
 
 
# create a list with the mean and sd of each
# of the margins... you can do this without plyr
# using apply() if you're into that sort of thing
# but alply makes it so easy to work with the results
# we will use these later to denormalize
normalMomentList <- alply(ourData, 2, normalMoments)
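
# peek at one entry: each element holds that column's mean ([[1]]) and sd ([[2]]),
# which is how we'll pull them back out when we denormalize later
normalMomentList[[1]]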
 
# normalize i.e. create Z score
# I think you don't HAVE to do this with gaussian marginals,
# but you DO with non-gaussian... so I'm in the habit.
ourDataZ <- data.frame(apply(ourData, 2, normalize))
 
# now ourDataZ is a data frame of Z scores.
# prove this to yourself by checking the
# mean and SD of each margin
 
apply(ourDataZ, 2, mean)
apply(ourDataZ, 2, sd)
 
#looks like mean of 0 and sd of 1 to me
 
# this fixes positive def, fits a copula, and makes draws
# in one line of code. Try that in Excel, bitches.
myDraws <- rcopula.gauss(n=drawCount,
                         Sigma=as.matrix(nearPD(Spearman(ourDataZ), corr=TRUE)$mat),
                         d=ncol(ourData))
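
# (aside) if QRMlib isn't handy, you can hand-roll the same Gaussian copula idea:
# draw multivariate normals with the target correlation matrix, then push them
# through pnorm() to get uniform margins. this is just a sketch of the same idea,
# not part of the pipeline below; sigmaHat / handRolledZ / handRolledU are
# throwaway names.
require("MASS")
sigmaHat    <- as.matrix(nearPD(Spearman(ourDataZ), corr=TRUE)$mat)
handRolledZ <- mvrnorm(n=drawCount, mu=rep(0, ncol(ourData)), Sigma=sigmaHat)
handRolledU <- pnorm(handRolledZ)   # uniform(0,1) margins, same dependence structure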
 
# rcopula.gauss spits out uniform values on (0,1) (i.e. probabilities), so we need
# to turn those into Z scores with qnorm(), the inverse of the normal CDF.
myDraws <- qnorm(myDraws)
 
#check the mean and sd
apply(myDraws, 2, mean)
apply(myDraws, 2, sd)
 
#should be 0, 1... and they are close..
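
# (aside) qnorm reshapes the margins but shouldn't touch the dependence;
# the rank correlations of the draws should be close to what we fed in
cor(myDraws, method="spearman")
cor(ourDataZ, method="spearman")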
 
# but we can do better than that... we know statistics!
# let's do a Kolmogorov-Smirnov test to see if these
# samples come from the same distribution
# KS does not check correlation,
# it only tests if two sets of samples
# came from same dist.. we'll check each column
 
# note: myDraws is a matrix, so index its columns with [, i]
# ([[i]] would grab a single element)
for (i in 1:ncol(ourData)) {
  print(ks.test(myDraws[, i], ourDataZ[[i]]))
}
 
# if the p-value of the KS test is < .05 then we
# reject that the distributions are equal
# they all look > .05 to me.
# Kolmogorov-Smirnov makes me want to drink, for some odd reason
# If your starting sample is small, you'll notice that a couple of the variables
# fail or just barely pass the KS test. This is common because KS is non-parametric and
# the starting sample will be 'lumpy' and not that big. If the starting n gets up
# to, say, 10K, they all do better. That's why I started with 10K, and even then
# it can be hit or miss.
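
# (aside) if you'd rather scan all the p-values at once, sapply does the trick
ksPvals <- sapply(1:ncol(ourData),
                  function(i) ks.test(myDraws[, i], ourDataZ[[i]])$p.value)
names(ksPvals) <- names(ourData)
ksPvals   # anything under .05 would make us reject "same distribution"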
 
# so we have a metric shit ton of random draws. But they are Z scores
# and we want them put back in their original shapes. So let's do that:
 
myDrawsDenorm <- myDraws
for (i in 1:ncol(myDrawsDenorm)) {
  # normalMomentList[[i]][[1]] is the column's mean, [[2]] is its sd
  myDrawsDenorm[, i] <- deNormalize(myDraws[, i],
                                    normalMomentList[[i]][[1]],
                                    normalMomentList[[i]][[2]])
}
 
myDrawsDenorm <- data.frame(myDrawsDenorm)
names(myDrawsDenorm) <- names(ourData)
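
# (aside) the same denormalization in one mapply() call, if loops aren't your thing;
# altDenorm is just a throwaway name for the comparison
altDenorm <- data.frame(mapply(function(z, m) deNormalize(z, m[[1]], m[[2]]),
                               data.frame(myDraws), normalMomentList))
all.equal(unname(as.matrix(altDenorm)), unname(as.matrix(myDrawsDenorm)))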
 
#let's look at the mean and standard dev of the starting data
apply(ourData, 2, mean)
apply(ourData, 2, sd)
 
#compare that with our sample data
apply(myDrawsDenorm, 2, mean)
apply(myDrawsDenorm, 2, sd)
 
 
# so myDrawsDenorm contains the final draws
# let's check Kolmogorov-Smirnov between the starting data
# and the final draws
 
for (i in 1:ncol(ourData)) {
  print(ks.test(myDrawsDenorm[[i]], ourData[[i]]))
}
 
# holy shit. it works!
 
#look at the correlation matrices
cor(myDrawsDenorm)
cor(ourData)
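
# a quick-and-dirty comparison is the element-wise difference between the two
round(cor(myDrawsDenorm) - cor(ourData), 3)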
 
#it's fun to plot the variables and see if the PDFs line up
#you could do this for each variable. It's a good sanity check.
plot(density(myDrawsDenorm$myData))
lines(density(ourData$myData), col="red")
 
# there's a test to see if the correlation matrices are the same
# but I'm too lazy to google for it
