marcovivero/cvCutoff.R

## cvCutoff.R
# 1. Load packages. Install any packages if necessary.
library(ggplot2)
library(MCMCpack)
library(rootSolve)

# 2. Extract data.
cv = read.csv("<FILE_PATH>/customerValue.csv", header = F)[, 1]
pcv = cv[which(cv > 0)]
pcvDF = data.frame(pcv)
names(pcvDF) = c("pcv")

### Data histogram.
ggplot(data = pcvDF) +
  labs(
    title = "Positive CV Score Histogram",
    x ="CV Score (Dollars)",
    y = NULL) +
  theme(plot.title = element_text(face = "bold", size = 25)) +
  theme_bw() +
  geom_histogram(
    aes(x = pcv),
    fill = "orange",
    alpha = 1,
    colour = "grey",
    lwd = 0.5,
    binwidth = 25) +
  coord_cartesian(xlim = c(0, 1000))

# 3. From blog post, we set p to be 0.9
p = 0.9

# 4. Computing CV Cutoff using empirical quantiles.

### Function for computing pth quantile of
computeQuantile = function(x) {
  quantile(x, c(p))[1]
}

### Compute the pth quantile of positive CV score data: 514.938
pcvQuantile = computeQuantile(pcv)

### Function for getting sample of size n with replacement.
getSample = function(n) {
  sample(pcv, n, replace = T)
}

### Function for computing boostrap sample for a numerical statistic.
getBootstrapEstimates = function(data, B, n, func) {
  samples = numeric(B)
  for (i in 1:B) {
    bootstrapSample = getSample(n)
    samples[i] = func(bootstrapSample)
  }

  return(samples)
}

### Get bootstrap sample for pth quantile estimates using B = 2500, n = 5000.
B = 5000
n = 7500
### Generate bootstrap quantile estimates.
quantileEstimates = getBootstrapEstimates(pcv, B, n, computeQuantile)
### Sorted estimates with actual positive CV score quantile.
sortedEstimatesWithPCVQuantile = sort(c(quantileEstimates, pcvQuantile))
### Get estimate mean quantile q with respect to rest of bootstrap estimates.
q = which(sortedEstimatesWithPCVQuantile == pcvQuantile)[1]/ (B + 1)
upperQuantile = min(q + (0.95 / 2), 1)
lowerQuantile = max(q - (0.95 / 2), 0)
confidenceInterval = quantile(quantileEstimates, c(lowerQuantile, upperQuantile))

### Confidence interval plot.
quantileEstimatesDF = data.frame(quantileEstimates)
names(quantileEstimatesDF)[1] = "x"
confidenceDF = data.frame(
  c(
    rep(confidenceInterval[1], 2),
    rep(confidenceInterval[2], 2),
    confidenceInterval[1]),
  c(0, 1200, 1200, 0, 0))
names(confidenceDF) = c("x", "y")

### Create bootstrap estimate plot with shaded confidence region.
ggplot() +
  labs(
    title = "Empirical Quantile Bootsrap Estimates With Shaded Confidence Set",
    x ="90% Quantile Estimate",
    y = NULL) +
  theme(plot.title = element_text(face = "bold", size = 25)) +
  theme_bw() +
  geom_histogram(
    aes(x = x),
    data = quantileEstimatesDF,
    fill = "orange",
    alpha = 1,
    colour = "grey",
    lwd = 0.2,
    binwidth = 3) +
  geom_polygon(
    aes(x = x, y = y),
    data = confidenceDF,
    fill = "red",
    lwd = 0,
    alpha = 0.5) +
  geom_vline(xintercept = pcvQuantile, colour = "blue") +
  coord_cartesian(xlim = c(400, 550))

### Compute quantile using kernel density estimates.
densityQuantile = ewcdf(pcv, c(p))
	# 1. Load packages. Install any packages if necessary.
	library(ggplot2)
	library(MCMCpack)
	library(rootSolve)

	# 2. Extract data.
	cv = read.csv("<FILE_PATH>/customerValue.csv", header = F)[, 1]
	pcv = cv[which(cv > 0)]
	pcvDF = data.frame(pcv)
	names(pcvDF) = c("pcv")

	### Data histogram.
	ggplot(data = pcvDF) +
	labs(
	title = "Positive CV Score Histogram",
	x ="CV Score (Dollars)",
	y = NULL) +
	theme(plot.title = element_text(face = "bold", size = 25)) +
	theme_bw() +
	geom_histogram(
	aes(x = pcv),
	fill = "orange",
	alpha = 1,
	colour = "grey",
	lwd = 0.5,
	binwidth = 25) +
	coord_cartesian(xlim = c(0, 1000))

	# 3. From blog post, we set p to be 0.9
	p = 0.9

	# 4. Computing CV Cutoff using empirical quantiles.

	### Function for computing pth quantile of
	computeQuantile = function(x) {
	quantile(x, c(p))[1]
	}

	### Compute the pth quantile of positive CV score data: 514.938
	pcvQuantile = computeQuantile(pcv)

	### Function for getting sample of size n with replacement.
	getSample = function(n) {
	sample(pcv, n, replace = T)
	}

	### Function for computing boostrap sample for a numerical statistic.
	getBootstrapEstimates = function(data, B, n, func) {
	samples = numeric(B)
	for (i in 1:B) {
	bootstrapSample = getSample(n)
	samples[i] = func(bootstrapSample)
	}

	return(samples)
	}

	### Get bootstrap sample for pth quantile estimates using B = 2500, n = 5000.
	B = 5000
	n = 7500
	### Generate bootstrap quantile estimates.
	quantileEstimates = getBootstrapEstimates(pcv, B, n, computeQuantile)
	### Sorted estimates with actual positive CV score quantile.
	sortedEstimatesWithPCVQuantile = sort(c(quantileEstimates, pcvQuantile))
	### Get estimate mean quantile q with respect to rest of bootstrap estimates.
	q = which(sortedEstimatesWithPCVQuantile == pcvQuantile)[1]/ (B + 1)
	upperQuantile = min(q + (0.95 / 2), 1)
	lowerQuantile = max(q - (0.95 / 2), 0)
	confidenceInterval = quantile(quantileEstimates, c(lowerQuantile, upperQuantile))

	### Confidence interval plot.
	quantileEstimatesDF = data.frame(quantileEstimates)
	names(quantileEstimatesDF)[1] = "x"
	confidenceDF = data.frame(
	c(
	rep(confidenceInterval[1], 2),
	rep(confidenceInterval[2], 2),
	confidenceInterval[1]),
	c(0, 1200, 1200, 0, 0))
	names(confidenceDF) = c("x", "y")

	### Create bootstrap estimate plot with shaded confidence region.
	ggplot() +
	labs(
	title = "Empirical Quantile Bootsrap Estimates With Shaded Confidence Set",
	x ="90% Quantile Estimate",
	y = NULL) +
	theme(plot.title = element_text(face = "bold", size = 25)) +
	theme_bw() +
	geom_histogram(
	aes(x = x),
	data = quantileEstimatesDF,
	fill = "orange",
	alpha = 1,
	colour = "grey",
	lwd = 0.2,
	binwidth = 3) +
	geom_polygon(
	aes(x = x, y = y),
	data = confidenceDF,
	fill = "red",
	lwd = 0,
	alpha = 0.5) +
	geom_vline(xintercept = pcvQuantile, colour = "blue") +
	coord_cartesian(xlim = c(400, 550))

	### Compute quantile using kernel density estimates.
	densityQuantile = ewcdf(pcv, c(p))