Skip to content

Instantly share code, notes, and snippets.

@pavelk2
Created June 23, 2015 12:18
Show Gist options
  • Save pavelk2/3e06eb51ed4831f47297 to your computer and use it in GitHub Desktop.
Save pavelk2/3e06eb51ed4831f47297 to your computer and use it in GitHub Desktop.
estimate-normal-distribution-from-small-sample-with-rankings
#
# Negative log likelihood for parameters `theta`, based on a subset `y` from
# a sample of `n` iid values, associated with ranks in `m`.
#
Lambda <- function(theta, y, m, n) {
mu <- theta[1]; sigma <- theta[2]
powers <- diff(c(0, m, n+1)) - 1
gaps <- diff(c(0, pnorm(y, mu, sigma), 1))
z <- dnorm(y, mu, sigma)
#a <- lgamma(n+1) - sum(lgamma(powers+1)) # Log multinomial coefficient
b <- sum(log(z)) + sum(powers*log(gaps))
return (-b)
}
#
# Return the Weibull probability plotting points for `n` values.
#
pp <- function(n) {
mid <- c()
if (n >= 3) mid <- (2:(n-1) - 0.3175)/(n + 0.365)
u <- (1/2)^(1/n)
c(1 - u, mid, u)
}
#
# ROS fit.
#
ROS <- function(y, m, n) {
sigma <- qnorm(pp(n)[m])
return (coef(lm(y ~ sigma)))
}
#
# MLE fit (estimates only).
#
MLE <- function(y, m, n) {
theta <- ROS(y, m, n) # Use ROS for an initial estimate
fit <- optim(theta, Lambda, y=y, m=m, n=n)
z <- c(fit$par, theta)
names(z) <- c("mu.MLE", "sigma.MLE", "mu.ROS", "sigma.ROS")
return (z) # Return MLE *and* ROS estimates
}
#
# Generate a dataset from a known Normal distribution.
#
n <- 100
m <- sort(c(98, 59, 27)) # The ranks to select
set.seed(17) # Allow reproducible results
x <- sort(rnorm(n)) # Sort the data once and for all
y <- x[m] # All fitting is based only on `y`, `m`, and `n`
#
# Fit using both methods.
#
fit <- MLE(y, m, n)
mu.hat <- fit[1]; sigma.hat <- fit[2]
#
# Plot the MLE fit.
#
pp.x <- qnorm(pp(n), mu.hat, sigma.hat)
pp.y <- pp.x[m]
plot(pp.x, x, col="Gray", xlab="Fit", ylab="Values",
main="Simulated Data, n=100")
abline(c(0,1), col="Black", lty=3)
points(pp.y, y, pch=16, col="#d03020", cex=1.25)
#
# Peform a Monte-Carlo assessment of the bias and variance of both.
#
system.time({ # Takes about 4 seconds per 1000 iterations
sim <- replicate(1e3, {
x <- sort(rnorm(n))
y <- x[m]
MLE(y, m, n)
})
})
rowMeans(sim) # Average estimates of the parameters
apply(sim, 1, sd) # Standard deviations of those estimates
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment