marcovivero/cvCutoff.R

## cvCutoff.R
# 1. Load packages. Install any packages if necessary.
library(ggplot2)
library(MCMCpack)
library(rootSolve)

# 2. Extract data.
cv = read.csv("~/customerValue.csv", header = F)[, 1]
pcv = cv[which(cv > 0)]
pcvDF = data.frame(pcv)
names(pcvDF) = c("pcv")

# 3. Compute kernel density estimate.

### Select number of nodes to use for functional plotting.
numNodes = 100000

### Compute density data frame.
pcvDensity = density(pcv, from = 0.0001, to = numNodes, n = numNodes)
pcvDensityDF = data.frame(pcvDensity$x, pcvDensity$y)
names(pcvDensityDF) = c("x", "y")


# 4. Compute proportion of users with positive CV scores: 0.03038587.
nPCV = length(pcv)
propPCV = nPCV / length(cv)

# 5. Create first plot.

pcvPlot <- ggplot(data = NULL) +
    labs(
      title = "Empirical Positive CV Score Distribution",
      x ="Positive CV Score (Dollars)",
      y = NULL) +
    theme(plot.title = element_text(face = "bold", size = 25)) +
    theme_bw() +
    geom_histogram(
      aes(x = pcv, y = ..density..),
      data = pcvDF,
      fill = "orange",
      alpha = 1,
      colour = "grey",
      lwd = 0.2,
      binwidth = 25) +
    geom_histogram(
      aes(x = pcv, y = ..density..),
      data = pcvDF,
      fill = "light blue",
      alpha = 0.6,
      colour = "grey",
      lwd = 0.2,
      binwidth = 50) +
    geom_histogram(
      aes(x = pcv, y = ..density..),
      data = pcvDF,
      fill = "grey",
      alpha = 0.2,
      colour = "grey",
      lwd = 0.2,
      binwidth = 100) +
    geom_path(
      aes(x = x, y = y),
      data = pcvDensityDF,
      linetype = 2,
      colour = "black") +
    scale_x_continuous(limits = c(0, numNodes)) +
    coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125))
pcvPlot

# 6. Compute MLE estimates for alpha and beta.

### Initialize MLE equation as a function.
igConstant = -(nPCV * log(mean(1 / pcv)) + sum(log(pcv)))
igMLE = Vectorize(function(x) {
  igConstant + nPCV * (log(x) + digamma(x))
})

### Set up function for checking if parameter estimates are indeed maxima.
maximaCheck = function(alpha, beta) {
  D = (nPCV^2 / beta^2) * (alpha * trigamma(alpha) - 1)
  secDer = -nPCV * trigamma(alpha)

  return(D > 0 & secDer < 0)
}

### Solve MLE equations.
alphaMLE = uniroot.all(igMLE, c(0.00001, 100))
betaMLE = alphaMLE / mean(1 / pcv)

### Make sure we have found a maximum, should return TRUE.
maximaCheck(alphaMLE, betaMLE)

# 7. Compute EMV estimates for alpha and beta.

### Estimate expectation, mode, and variance from data.
e = mean(pcv)
m = pcvDensity$x[which.max(pcvDensity$y)]
v = var(pcv)

### Intialize EMV equation as a function.
igEMV = Vectorize(function(x) {
  out = ((e - m)^2 / 4) * x^4 +
    (-v) * x^3 +
    ((8 * v - (e - m)^2) / 2) * x^2 +
    (-5 * v) * x +
    ((8 * v + (e - m)^2) / 4)
})

### Solve EMV equations.
alphaEMV = uniroot.all(igEMV, c(2, 100))
betaEMV = ((e - m) / 2) * (alphaEMV ^ 2 - 1)

# 8. Get densities.

### Helper function for obtaining inverse-gamma density given a pair of parameters.
getInvGammaDensity = function(alpha, beta) {
  d = Vectorize(function(x) {
    dinvgamma(x, shape = alpha, scale = beta)
  })
  return(d)
}

fittedInvGammaMLE = getInvGammaDensity(alphaMLE, betaMLE)
fittedInvGammaEMVOne = getInvGammaDensity(alphaEMV[1], betaEMV[1])
fittedInvGammaEMVTwo = getInvGammaDensity(alphaEMV[2], betaEMV[2])

# 9. Plot densities.

### Helper function for obtaining data frame plots.
getFunctionDF = function(func) {
  x = seq(0.00001, numNodes, by = 1)
  df = data.frame(x, func(x))
  names(df) = c("x", "y")

  return(df)
}

mleDF = getFunctionDF(fittedInvGammaMLE)
emvOneDF = getFunctionDF(fittedInvGammaEMVOne)
emvTwoDF = getFunctionDF(fittedInvGammaEMVTwo)


dPlot <- ggplot(data = NULL) +
  labs(
    title = "Fitted Densities for Positive CV Score Distribution",
    x ="Positive CV Score (Dollars)",
    y = NULL) +
  theme(plot.title = element_text(face = "bold", size = 25)) +
  theme_bw() +
  geom_histogram(
    aes(x = pcv, y = ..density..),
    data = pcvDF,
    fill = "light blue",
    alpha = 0.5,
    colour = "grey",
    lwd = 0.2,
    binwidth = 25) +
  geom_path(
    aes(x = x, y = y, colour = "1"),
    data = pcvDensityDF,
    lwd = 0.65,
    linetype = 2) +
  geom_path(
    aes(x = x, y = y, colour = "2"),
    data = mleDF,
    lwd = 0.65,
    linetype = 2) +
  geom_path(
    aes(x = x, y = y, colour = "3"),
    data = emvOneDF,
    lwd = 0.65,
    linetype = 2) +
  geom_path(
    aes(x = x, y = y, colour = "4"),
    data = emvTwoDF,
    lwd = 0.65,
    linetype = 2) +
  scale_x_continuous(limits = c(0, numNodes)) +
  coord_cartesian(xlim = c(0, 5000), ylim = c(0, 0.0125)) +
  scale_colour_manual(
    values=c("black", "red", "blue", "green"),
    name="Density Curves",
    breaks = c("1", "2", "3", "4"),
    labels=c(
      "Kernel Density",
      "Inverse-Gamma  MLE Density",
      "Inverse-Gamma EMV 1 Density",
      "Inverse-Gamma EMV 2 Density"))
dPlot

# 10. Fit mixture model.

### Helper function for extracting mixtures.
getMixture = function(pi) {
  mixture = Vectorize(function(x) {
    pi[1] * fittedInvGammaMLE(x) +
    pi[2] * fittedInvGammaEMVOne(x) +
    pi[3] * fittedInvGammaEMVTwo(x)
  })

  return(mixture)
}

### Set up Dirichlet prior and sampler.
dirichletSampler = function(concentrations) {as.numeric(rdirichlet(1, concentrations))}

### Helper function for computing transition probability.
computeLogProbability = function(pi) {
  mixture = getMixture(pi)

  return(sum(log(mixture(pcv))))
}

### Function for running Metropolis-Hastings algorithm.
runMetropolisHastings = function(
  sampleSize,
  initialWeights,
  concentrations = rep(1, 3)) {
  ### Initialize variables.
  piOld = initialWeights
  logProbOld = computeLogProbability(piOld)
  sample = vector("list", length = sampleSize)

  for (i in 1:sampleSize) {
    piCand = dirichletSampler(concentrations)
    logProbCand = computeLogProbability(piCand)
    r = min(exp(logProbCand - logProbOld), 1)
    if (runif(1) < r) {
      sample[[i]] = piCand
      piOld = piCand
      logProbOld = logProbCand
    } else {
      sample[[i]] = piOld
    }

    print(paste0("Sample ", i, ", transition probability: ", r,
                 ", sample: ", sample[[i]][1], ", ", sample[[i]][2],
                 ", candidates: ", piCand[1], ", ", piCand[2]))
  }

  return(t(sapply(sample, as.numeric)))
}

### Start Markov Chain at different simplex vertices and [1/3, 1/3, 1/3].
### I recommed you just use printed report to inspect chain at first, as there is very
### high auto-correlation, and restart chain after modifying concentrations
### to change the sampling envelope.


piStart = c(0.801250452048145, 0.188660534125485, 0.01008901)
concentrations = c(0.80, 0.19, 0.01)* 1500
weightSamples = runMetropolisHastings(150, piStart, concentrations)
sampleMatrix = t(sapply(weightSamples, as.numeric))
weights = colMeans(sampleMatrix) # These are roughly the mixing proportions.

# 11. Plot mixture density.

mixtureDF = getFunctionDF(getMixture(weights))

mPlot <- ggplot(data = NULL) +
  labs(
    title = "Mixture Density for Positive CV Score Distribution",
    x ="Positive CV Score (Dollars)",
    y = NULL) +
  theme(plot.title = element_text(face = "bold", size = 25)) +
  theme_bw() +
  geom_histogram(
    aes(x = pcv, y = ..density..),
    data = pcvDF,
    fill = "light blue",
    alpha = 0.5,
    colour = "grey",
    lwd = 0.2,
    binwidth = 25) +
  geom_path(
    aes(x = x, y = y, colour = "1"),
    data = pcvDensityDF,
    lwd = 0.65,
    linetype = 2) +
  geom_path(
    aes(x = x, y = y, colour = "2"),
    data = mixtureDF,
    lwd = 0.65,
    linetype = 2) +
  scale_x_continuous(limits = c(0, numNodes)) +
  coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125)) +
  scale_colour_manual(
    values=c("black", "red"),
    name="Density Curves",
    breaks = c("1", "2"),
    labels=c(
      "Kernel Density",
      "Inverse-Gamma Mixture Density"))
mPlot

# 12. Compute CV cutoff.

quantileFunc = function(p, func) {
  output = Vectorize(function(x) {
    integrate(func, 0, x)$value - p
  })

  return(output)
}

cvCutoff = uniroot.all(quantileFunc(0.95, getMixture(weights)), c(0.0001, 1000))
	# 1. Load packages. Install any packages if necessary.
	library(ggplot2)
	library(MCMCpack)
	library(rootSolve)

	# 2. Extract data.
	cv = read.csv("~/customerValue.csv", header = F)[, 1]
	pcv = cv[which(cv > 0)]
	pcvDF = data.frame(pcv)
	names(pcvDF) = c("pcv")

	# 3. Compute kernel density estimate.

	### Select number of nodes to use for functional plotting.
	numNodes = 100000

	### Compute density data frame.
	pcvDensity = density(pcv, from = 0.0001, to = numNodes, n = numNodes)
	pcvDensityDF = data.frame(pcvDensity$x, pcvDensity$y)
	names(pcvDensityDF) = c("x", "y")


	# 4. Compute proportion of users with positive CV scores: 0.03038587.
	nPCV = length(pcv)
	propPCV = nPCV / length(cv)

	# 5. Create first plot.

	pcvPlot <- ggplot(data = NULL) +
	labs(
	title = "Empirical Positive CV Score Distribution",
	x ="Positive CV Score (Dollars)",
	y = NULL) +
	theme(plot.title = element_text(face = "bold", size = 25)) +
	theme_bw() +
	geom_histogram(
	aes(x = pcv, y = ..density..),
	data = pcvDF,
	fill = "orange",
	alpha = 1,
	colour = "grey",
	lwd = 0.2,
	binwidth = 25) +
	geom_histogram(
	aes(x = pcv, y = ..density..),
	data = pcvDF,
	fill = "light blue",
	alpha = 0.6,
	colour = "grey",
	lwd = 0.2,
	binwidth = 50) +
	geom_histogram(
	aes(x = pcv, y = ..density..),
	data = pcvDF,
	fill = "grey",
	alpha = 0.2,
	colour = "grey",
	lwd = 0.2,
	binwidth = 100) +
	geom_path(
	aes(x = x, y = y),
	data = pcvDensityDF,
	linetype = 2,
	colour = "black") +
	scale_x_continuous(limits = c(0, numNodes)) +
	coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125))
	pcvPlot

	# 6. Compute MLE estimates for alpha and beta.

	### Initialize MLE equation as a function.
	igConstant = -(nPCV * log(mean(1 / pcv)) + sum(log(pcv)))
	igMLE = Vectorize(function(x) {
	igConstant + nPCV * (log(x) + digamma(x))
	})

	### Set up function for checking if parameter estimates are indeed maxima.
	maximaCheck = function(alpha, beta) {
	D = (nPCV^2 / beta^2) * (alpha * trigamma(alpha) - 1)
	secDer = -nPCV * trigamma(alpha)

	return(D > 0 & secDer < 0)
	}

	### Solve MLE equations.
	alphaMLE = uniroot.all(igMLE, c(0.00001, 100))
	betaMLE = alphaMLE / mean(1 / pcv)

	### Make sure we have found a maximum, should return TRUE.
	maximaCheck(alphaMLE, betaMLE)

	# 7. Compute EMV estimates for alpha and beta.

	### Estimate expectation, mode, and variance from data.
	e = mean(pcv)
	m = pcvDensity$x[which.max(pcvDensity$y)]
	v = var(pcv)

	### Intialize EMV equation as a function.
	igEMV = Vectorize(function(x) {
	out = ((e - m)^2 / 4) * x^4 +
	(-v) * x^3 +
	((8 * v - (e - m)^2) / 2) * x^2 +
	(-5 * v) * x +
	((8 * v + (e - m)^2) / 4)
	})

	### Solve EMV equations.
	alphaEMV = uniroot.all(igEMV, c(2, 100))
	betaEMV = ((e - m) / 2) * (alphaEMV ^ 2 - 1)

	# 8. Get densities.

	### Helper function for obtaining inverse-gamma density given a pair of parameters.
	getInvGammaDensity = function(alpha, beta) {
	d = Vectorize(function(x) {
	dinvgamma(x, shape = alpha, scale = beta)
	})
	return(d)
	}

	fittedInvGammaMLE = getInvGammaDensity(alphaMLE, betaMLE)
	fittedInvGammaEMVOne = getInvGammaDensity(alphaEMV[1], betaEMV[1])
	fittedInvGammaEMVTwo = getInvGammaDensity(alphaEMV[2], betaEMV[2])

	# 9. Plot densities.

	### Helper function for obtaining data frame plots.
	getFunctionDF = function(func) {
	x = seq(0.00001, numNodes, by = 1)
	df = data.frame(x, func(x))
	names(df) = c("x", "y")

	return(df)
	}

	mleDF = getFunctionDF(fittedInvGammaMLE)
	emvOneDF = getFunctionDF(fittedInvGammaEMVOne)
	emvTwoDF = getFunctionDF(fittedInvGammaEMVTwo)


	dPlot <- ggplot(data = NULL) +
	labs(
	title = "Fitted Densities for Positive CV Score Distribution",
	x ="Positive CV Score (Dollars)",
	y = NULL) +
	theme(plot.title = element_text(face = "bold", size = 25)) +
	theme_bw() +
	geom_histogram(
	aes(x = pcv, y = ..density..),
	data = pcvDF,
	fill = "light blue",
	alpha = 0.5,
	colour = "grey",
	lwd = 0.2,
	binwidth = 25) +
	geom_path(
	aes(x = x, y = y, colour = "1"),
	data = pcvDensityDF,
	lwd = 0.65,
	linetype = 2) +
	geom_path(
	aes(x = x, y = y, colour = "2"),
	data = mleDF,
	lwd = 0.65,
	linetype = 2) +
	geom_path(
	aes(x = x, y = y, colour = "3"),
	data = emvOneDF,
	lwd = 0.65,
	linetype = 2) +
	geom_path(
	aes(x = x, y = y, colour = "4"),
	data = emvTwoDF,
	lwd = 0.65,
	linetype = 2) +
	scale_x_continuous(limits = c(0, numNodes)) +
	coord_cartesian(xlim = c(0, 5000), ylim = c(0, 0.0125)) +
	scale_colour_manual(
	values=c("black", "red", "blue", "green"),
	name="Density Curves",
	breaks = c("1", "2", "3", "4"),
	labels=c(
	"Kernel Density",
	"Inverse-Gamma MLE Density",
	"Inverse-Gamma EMV 1 Density",
	"Inverse-Gamma EMV 2 Density"))
	dPlot

	# 10. Fit mixture model.

	### Helper function for extracting mixtures.
	getMixture = function(pi) {
	mixture = Vectorize(function(x) {
	pi[1] * fittedInvGammaMLE(x) +
	pi[2] * fittedInvGammaEMVOne(x) +
	pi[3] * fittedInvGammaEMVTwo(x)
	})

	return(mixture)
	}

	### Set up Dirichlet prior and sampler.
	dirichletSampler = function(concentrations) {as.numeric(rdirichlet(1, concentrations))}

	### Helper function for computing transition probability.
	computeLogProbability = function(pi) {
	mixture = getMixture(pi)

	return(sum(log(mixture(pcv))))
	}

	### Function for running Metropolis-Hastings algorithm.
	runMetropolisHastings = function(
	sampleSize,
	initialWeights,
	concentrations = rep(1, 3)) {
	### Initialize variables.
	piOld = initialWeights
	logProbOld = computeLogProbability(piOld)
	sample = vector("list", length = sampleSize)

	for (i in 1:sampleSize) {
	piCand = dirichletSampler(concentrations)
	logProbCand = computeLogProbability(piCand)
	r = min(exp(logProbCand - logProbOld), 1)
	if (runif(1) < r) {
	sample[[i]] = piCand
	piOld = piCand
	logProbOld = logProbCand
	} else {
	sample[[i]] = piOld
	}

	print(paste0("Sample ", i, ", transition probability: ", r,
	", sample: ", sample[[i]][1], ", ", sample[[i]][2],
	", candidates: ", piCand[1], ", ", piCand[2]))
	}

	return(t(sapply(sample, as.numeric)))
	}

	### Start Markov Chain at different simplex vertices and [1/3, 1/3, 1/3].
	### I recommed you just use printed report to inspect chain at first, as there is very
	### high auto-correlation, and restart chain after modifying concentrations
	### to change the sampling envelope.


	piStart = c(0.801250452048145, 0.188660534125485, 0.01008901)
	concentrations = c(0.80, 0.19, 0.01)* 1500
	weightSamples = runMetropolisHastings(150, piStart, concentrations)
	sampleMatrix = t(sapply(weightSamples, as.numeric))
	weights = colMeans(sampleMatrix) # These are roughly the mixing proportions.

	# 11. Plot mixture density.

	mixtureDF = getFunctionDF(getMixture(weights))

	mPlot <- ggplot(data = NULL) +
	labs(
	title = "Mixture Density for Positive CV Score Distribution",
	x ="Positive CV Score (Dollars)",
	y = NULL) +
	theme(plot.title = element_text(face = "bold", size = 25)) +
	theme_bw() +
	geom_histogram(
	aes(x = pcv, y = ..density..),
	data = pcvDF,
	fill = "light blue",
	alpha = 0.5,
	colour = "grey",
	lwd = 0.2,
	binwidth = 25) +
	geom_path(
	aes(x = x, y = y, colour = "1"),
	data = pcvDensityDF,
	lwd = 0.65,
	linetype = 2) +
	geom_path(
	aes(x = x, y = y, colour = "2"),
	data = mixtureDF,
	lwd = 0.65,
	linetype = 2) +
	scale_x_continuous(limits = c(0, numNodes)) +
	coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125)) +
	scale_colour_manual(
	values=c("black", "red"),
	name="Density Curves",
	breaks = c("1", "2"),
	labels=c(
	"Kernel Density",
	"Inverse-Gamma Mixture Density"))
	mPlot

	# 12. Compute CV cutoff.

	quantileFunc = function(p, func) {
	output = Vectorize(function(x) {
	integrate(func, 0, x)$value - p
	})

	return(output)
	}

	cvCutoff = uniroot.all(quantileFunc(0.95, getMixture(weights)), c(0.0001, 1000))