Skip to content

Instantly share code, notes, and snippets.

@marcovivero
Created January 13, 2016 07:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marcovivero/228f29d7325140f8cd69 to your computer and use it in GitHub Desktop.
Save marcovivero/228f29d7325140f8cd69 to your computer and use it in GitHub Desktop.
# 1. Load packages. Install any packages if necessary.
library(ggplot2)
library(MCMCpack)
library(rootSolve)
# 2. Extract data.
cv = read.csv("~/customerValue.csv", header = F)[, 1]
pcv = cv[which(cv > 0)]
pcvDF = data.frame(pcv)
names(pcvDF) = c("pcv")
# 3. Compute kernel density estimate.
### Select number of nodes to use for functional plotting.
numNodes = 100000
### Compute density data frame.
pcvDensity = density(pcv, from = 0.0001, to = numNodes, n = numNodes)
pcvDensityDF = data.frame(pcvDensity$x, pcvDensity$y)
names(pcvDensityDF) = c("x", "y")
# 4. Compute proportion of users with positive CV scores: 0.03038587.
nPCV = length(pcv)
propPCV = nPCV / length(cv)
# 5. Create first plot.
pcvPlot <- ggplot(data = NULL) +
labs(
title = "Empirical Positive CV Score Distribution",
x ="Positive CV Score (Dollars)",
y = NULL) +
theme(plot.title = element_text(face = "bold", size = 25)) +
theme_bw() +
geom_histogram(
aes(x = pcv, y = ..density..),
data = pcvDF,
fill = "orange",
alpha = 1,
colour = "grey",
lwd = 0.2,
binwidth = 25) +
geom_histogram(
aes(x = pcv, y = ..density..),
data = pcvDF,
fill = "light blue",
alpha = 0.6,
colour = "grey",
lwd = 0.2,
binwidth = 50) +
geom_histogram(
aes(x = pcv, y = ..density..),
data = pcvDF,
fill = "grey",
alpha = 0.2,
colour = "grey",
lwd = 0.2,
binwidth = 100) +
geom_path(
aes(x = x, y = y),
data = pcvDensityDF,
linetype = 2,
colour = "black") +
scale_x_continuous(limits = c(0, numNodes)) +
coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125))
pcvPlot
# 6. Compute MLE estimates for alpha and beta.
### Initialize MLE equation as a function.
igConstant = -(nPCV * log(mean(1 / pcv)) + sum(log(pcv)))
igMLE = Vectorize(function(x) {
igConstant + nPCV * (log(x) + digamma(x))
})
### Set up function for checking if parameter estimates are indeed maxima.
maximaCheck = function(alpha, beta) {
D = (nPCV^2 / beta^2) * (alpha * trigamma(alpha) - 1)
secDer = -nPCV * trigamma(alpha)
return(D > 0 & secDer < 0)
}
### Solve MLE equations.
alphaMLE = uniroot.all(igMLE, c(0.00001, 100))
betaMLE = alphaMLE / mean(1 / pcv)
### Make sure we have found a maximum, should return TRUE.
maximaCheck(alphaMLE, betaMLE)
# 7. Compute EMV estimates for alpha and beta.
### Estimate expectation, mode, and variance from data.
e = mean(pcv)
m = pcvDensity$x[which.max(pcvDensity$y)]
v = var(pcv)
### Intialize EMV equation as a function.
igEMV = Vectorize(function(x) {
out = ((e - m)^2 / 4) * x^4 +
(-v) * x^3 +
((8 * v - (e - m)^2) / 2) * x^2 +
(-5 * v) * x +
((8 * v + (e - m)^2) / 4)
})
### Solve EMV equations.
alphaEMV = uniroot.all(igEMV, c(2, 100))
betaEMV = ((e - m) / 2) * (alphaEMV ^ 2 - 1)
# 8. Get densities.
### Helper function for obtaining inverse-gamma density given a pair of parameters.
getInvGammaDensity = function(alpha, beta) {
d = Vectorize(function(x) {
dinvgamma(x, shape = alpha, scale = beta)
})
return(d)
}
fittedInvGammaMLE = getInvGammaDensity(alphaMLE, betaMLE)
fittedInvGammaEMVOne = getInvGammaDensity(alphaEMV[1], betaEMV[1])
fittedInvGammaEMVTwo = getInvGammaDensity(alphaEMV[2], betaEMV[2])
# 9. Plot densities.
### Helper function for obtaining data frame plots.
getFunctionDF = function(func) {
x = seq(0.00001, numNodes, by = 1)
df = data.frame(x, func(x))
names(df) = c("x", "y")
return(df)
}
mleDF = getFunctionDF(fittedInvGammaMLE)
emvOneDF = getFunctionDF(fittedInvGammaEMVOne)
emvTwoDF = getFunctionDF(fittedInvGammaEMVTwo)
dPlot <- ggplot(data = NULL) +
labs(
title = "Fitted Densities for Positive CV Score Distribution",
x ="Positive CV Score (Dollars)",
y = NULL) +
theme(plot.title = element_text(face = "bold", size = 25)) +
theme_bw() +
geom_histogram(
aes(x = pcv, y = ..density..),
data = pcvDF,
fill = "light blue",
alpha = 0.5,
colour = "grey",
lwd = 0.2,
binwidth = 25) +
geom_path(
aes(x = x, y = y, colour = "1"),
data = pcvDensityDF,
lwd = 0.65,
linetype = 2) +
geom_path(
aes(x = x, y = y, colour = "2"),
data = mleDF,
lwd = 0.65,
linetype = 2) +
geom_path(
aes(x = x, y = y, colour = "3"),
data = emvOneDF,
lwd = 0.65,
linetype = 2) +
geom_path(
aes(x = x, y = y, colour = "4"),
data = emvTwoDF,
lwd = 0.65,
linetype = 2) +
scale_x_continuous(limits = c(0, numNodes)) +
coord_cartesian(xlim = c(0, 5000), ylim = c(0, 0.0125)) +
scale_colour_manual(
values=c("black", "red", "blue", "green"),
name="Density Curves",
breaks = c("1", "2", "3", "4"),
labels=c(
"Kernel Density",
"Inverse-Gamma MLE Density",
"Inverse-Gamma EMV 1 Density",
"Inverse-Gamma EMV 2 Density"))
dPlot
# 10. Fit mixture model.
### Helper function for extracting mixtures.
getMixture = function(pi) {
mixture = Vectorize(function(x) {
pi[1] * fittedInvGammaMLE(x) +
pi[2] * fittedInvGammaEMVOne(x) +
pi[3] * fittedInvGammaEMVTwo(x)
})
return(mixture)
}
### Set up Dirichlet prior and sampler.
dirichletSampler = function(concentrations) {as.numeric(rdirichlet(1, concentrations))}
### Helper function for computing transition probability.
computeLogProbability = function(pi) {
mixture = getMixture(pi)
return(sum(log(mixture(pcv))))
}
### Function for running Metropolis-Hastings algorithm.
runMetropolisHastings = function(
sampleSize,
initialWeights,
concentrations = rep(1, 3)) {
### Initialize variables.
piOld = initialWeights
logProbOld = computeLogProbability(piOld)
sample = vector("list", length = sampleSize)
for (i in 1:sampleSize) {
piCand = dirichletSampler(concentrations)
logProbCand = computeLogProbability(piCand)
r = min(exp(logProbCand - logProbOld), 1)
if (runif(1) < r) {
sample[[i]] = piCand
piOld = piCand
logProbOld = logProbCand
} else {
sample[[i]] = piOld
}
print(paste0("Sample ", i, ", transition probability: ", r,
", sample: ", sample[[i]][1], ", ", sample[[i]][2],
", candidates: ", piCand[1], ", ", piCand[2]))
}
return(t(sapply(sample, as.numeric)))
}
### Start Markov Chain at different simplex vertices and [1/3, 1/3, 1/3].
### I recommed you just use printed report to inspect chain at first, as there is very
### high auto-correlation, and restart chain after modifying concentrations
### to change the sampling envelope.
piStart = c(0.801250452048145, 0.188660534125485, 0.01008901)
concentrations = c(0.80, 0.19, 0.01)* 1500
weightSamples = runMetropolisHastings(150, piStart, concentrations)
sampleMatrix = t(sapply(weightSamples, as.numeric))
weights = colMeans(sampleMatrix) # These are roughly the mixing proportions.
# 11. Plot mixture density.
mixtureDF = getFunctionDF(getMixture(weights))
mPlot <- ggplot(data = NULL) +
labs(
title = "Mixture Density for Positive CV Score Distribution",
x ="Positive CV Score (Dollars)",
y = NULL) +
theme(plot.title = element_text(face = "bold", size = 25)) +
theme_bw() +
geom_histogram(
aes(x = pcv, y = ..density..),
data = pcvDF,
fill = "light blue",
alpha = 0.5,
colour = "grey",
lwd = 0.2,
binwidth = 25) +
geom_path(
aes(x = x, y = y, colour = "1"),
data = pcvDensityDF,
lwd = 0.65,
linetype = 2) +
geom_path(
aes(x = x, y = y, colour = "2"),
data = mixtureDF,
lwd = 0.65,
linetype = 2) +
scale_x_continuous(limits = c(0, numNodes)) +
coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125)) +
scale_colour_manual(
values=c("black", "red"),
name="Density Curves",
breaks = c("1", "2"),
labels=c(
"Kernel Density",
"Inverse-Gamma Mixture Density"))
mPlot
# 12. Compute CV cutoff.
quantileFunc = function(p, func) {
output = Vectorize(function(x) {
integrate(func, 0, x)$value - p
})
return(output)
}
cvCutoff = uniroot.all(quantileFunc(0.95, getMixture(weights)), c(0.0001, 1000))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment