alexeygrigorev/kldiv.R

## kldiv.R
install.packages('hflights', 'entropy')

library(hflights)
library(entropy)


sample.size = 10000
reps = 5000

mean(hflights$ArrDelay, na.rm=T)

bootstrap.kl = function(data, sample.size, reps) {
  thetas = replicate(reps,
                     mean(sample(data, size=sample.size), na.rm=T))

  h1 = hist(thetas, breaks=18, probability=T)
  bootstrap.sample = sample(data, size=sample.size)

  thetas.hat =
    replicate(reps,
              mean(sample(bootstrap.sample, size=sample.size, replace=T),
                   na.rm=T))

  h2 = hist(thetas.hat, breaks=18)

  minval = min(c(thetas, thetas.hat))
  maxval = max(c(thetas, thetas.hat))
  xlim = c(minval, maxval)

  plot(h1, col=rgb(0,0,1,1/4), xlim=xlim)
  plot(h2, col=rgb(1,0,0,1/4), add=T)
  legend('topright', c('true', 'bootstrap'),
         col=c(rgb(0,0,1,0.5), rgb(1,0,0,0.5)),
         pch=15)

  bins = seq(minval, maxval, length=50)
  h1table = table(cut(thetas, bins))
  h2table = table(cut(thetas.hat, bins))

  dist = KL.empirical(h1table + 1, h2table + 1)
  return(dist)
}


hist(hflights$ArrDelay)
bootstrap.kl(hflights$ArrDelay, sample.size=5000, reps=5000)
	install.packages('hflights', 'entropy')

	library(hflights)
	library(entropy)


	sample.size = 10000
	reps = 5000

	mean(hflights$ArrDelay, na.rm=T)

	bootstrap.kl = function(data, sample.size, reps) {
	thetas = replicate(reps,
	mean(sample(data, size=sample.size), na.rm=T))

	h1 = hist(thetas, breaks=18, probability=T)
	bootstrap.sample = sample(data, size=sample.size)

	thetas.hat =
	replicate(reps,
	mean(sample(bootstrap.sample, size=sample.size, replace=T),
	na.rm=T))

	h2 = hist(thetas.hat, breaks=18)

	minval = min(c(thetas, thetas.hat))
	maxval = max(c(thetas, thetas.hat))
	xlim = c(minval, maxval)

	plot(h1, col=rgb(0,0,1,1/4), xlim=xlim)
	plot(h2, col=rgb(1,0,0,1/4), add=T)
	legend('topright', c('true', 'bootstrap'),
	col=c(rgb(0,0,1,0.5), rgb(1,0,0,0.5)),
	pch=15)

	bins = seq(minval, maxval, length=50)
	h1table = table(cut(thetas, bins))
	h2table = table(cut(thetas.hat, bins))

	dist = KL.empirical(h1table + 1, h2table + 1)
	return(dist)
	}


	hist(hflights$ArrDelay)
	bootstrap.kl(hflights$ArrDelay, sample.size=5000, reps=5000)