@WinstonCampeau · Created October 27, 2018
R - Catchall (a plethora of 2D model analyses)
CATCHALL <- function(NUM_RESAMPLES = 100, KFOLDS = 5, INDIVIDUAL_PLOTS = 0) {
#Defaults: 100 bootstrap resamples (the jackknife uses double), 5 folds,
#and a single combined plot (INDIVIDUAL_PLOTS = 0)
cat("Please select your data; The data must be (X_COLUMN, Y_COLUMN) and csv file type\nIf you did not enter a number of bootstrap resamples, the default is set to 100 - double for jackknife - and defaults to 5 kfolds")
cat("\n")
cat("\n")
data <- read.csv(file.choose(), header=T)
XD <- as.vector(data[[1]])
YD <- as.vector(data[[2]])
SXY <- sum((XD - mean(XD))*(YD-mean(YD)))
SXX <- sum((XD-mean(XD))^2)
SYY <- sum((YD-mean(YD))^2)
names <- colnames(data)
#Crude seed estimates (endpoint slope) used to initialise optim() below
EST_B1 <- (YD[length(YD)]-YD[1])/(XD[length(XD)]-XD[1])
EST_B0 <- mean(YD)-(EST_B1*mean(XD))
#Computes Model I least squares (OLS) in closed form
MI_B1 <- SXY/SXX
MI_B0 <- mean(YD)-(MI_B1*mean(XD))
YHATS <- MI_B0 + MI_B1*(XD)
RSS_MI <- sum((YHATS - YD)^2)
MI_B0_STDERROR <- sqrt((RSS_MI/(nrow(data)-2))*((1/nrow(data))+((mean(XD)^2)/SXX)))
MI_B1_STDERROR <- sqrt((RSS_MI/(nrow(data)-2))/SXX)
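#Optional sanity check (a sketch, not part of the output): the closed-form
#estimates above should agree with R's built-in lm() fit.
#OLS_FIT <- lm(YD ~ XD)
#coef(OLS_FIT)                                   # compare with c(MI_B0, MI_B1)
#summary(OLS_FIT)$coefficients[, "Std. Error"]   # compare with the std errors above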
#Computes a least absolute deviations (LAD) fit numerically with optim()
LAD <- function(A) {
EY <- A[1] + A[2]*XD
sum(abs(EY - YD))
}
LAD_SOLUTION <- optim(c(EST_B0, EST_B1), LAD)
LAD_RSS <- sum((YD-(LAD_SOLUTION[[1]][1]+(LAD_SOLUTION[[1]][2]*XD)))^2)
#LAD_B0_STDERROR <- sqrt((LAD_RSS/(nrow(data)-2))*((1/nrow(data))+((mean(XD)^2)/SXX)))
#LAD_B1_STDERROR <- sqrt((LAD_RSS/(nrow(data)-2))/SXX)
#LAD_RSQR <- (LAD_SOLUTION[[1]][2]*SXY)/SYY
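#Optional cross-check (a sketch; assumes the quantreg package is installed):
#median (tau = 0.5) quantile regression is the standard LAD estimator.
#library(quantreg)
#coef(rq(YD ~ XD, tau = 0.5))   # compare with LAD_SOLUTION[[1]]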
#Computes a maximum likelihood (ML) fit (several shortcuts below rely on the equivalence of ML and OLS under normal errors)
MAXIMUM_LIKELIHOOD <- function(B) {
B0 <- B[1]
B1 <- B[2]
YHAT <- B0 + B1*XD
RESIDUALS <- YD - YHAT
#Negative log-likelihood under normal errors (optim() minimises)
LOGFX <- dnorm(RESIDUALS, mean(RESIDUALS), sd=sd(RESIDUALS), log=TRUE)
-sum(LOGFX)
}
MAXIMUM_LIKELIHOOD_SOLUTION <- try(optim(c(MI_B0, MI_B1), MAXIMUM_LIKELIHOOD), silent=TRUE)
#ML_RSS <- sum((YD-(MAXIMUM_LIKELIHOOD_SOLUTION[[1]][1]+MAXIMUM_LIKELIHOOD_SOLUTION[[1]][2]*XD))^2)
if(inherits(MAXIMUM_LIKELIHOOD_SOLUTION, "try-error")) {
#If the optimiser fails, fall back on the OLS estimates (ML and OLS coincide under normal errors)
MAXIMUM_LIKELIHOOD_SOLUTION <- LAD_SOLUTION
MAXIMUM_LIKELIHOOD_SOLUTION[[1]][1] <- MI_B0
MAXIMUM_LIKELIHOOD_SOLUTION[[1]][2] <- MI_B1
ML_B0_STDERROR <- MI_B0_STDERROR
ML_B1_STDERROR <- MI_B1_STDERROR
} else {
#Standard errors use the optimised objective value as an RSS stand-in
ML_B0_STDERROR <- sqrt((abs(MAXIMUM_LIKELIHOOD_SOLUTION[[2]])/(nrow(data)-2))*((1/nrow(data))+((mean(XD)^2)/SXX)))
ML_B1_STDERROR <- sqrt((abs(MAXIMUM_LIKELIHOOD_SOLUTION[[2]])/(nrow(data)-2))/SXX)
}
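#Under normal errors the ML point estimates should match OLS, and the ML
#estimate of the residual SD has the closed form sqrt(RSS/n) (a sketch,
#assuming the usual Gaussian model; not used in the output):
#SIGMA_MLE <- sqrt(RSS_MI/nrow(data))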
#############################################################################
#OLD Computed LS II
#MODELII <- function(C) {
#EY = C[1] + C[2]*XD
#RESIDUALS = ((YD-EY)^2)/(1+(C[2]^2))
#SUM_RESIDUALS <- sum(RESIDUALS)
#}
#MODELII_SOLUTION <- optim(c(EST_B0, EST_B1), MODELII)
#MODELII_RSS <- sum(((YD-(MODELII_SOLUTION[[1]][1]+MODELII_SOLUTION[[1]][2]*XD))^2)/(1+(MODELII_SOLUTION[[1]][2]^2)))
#MII <- (MODELII_SOLUTION[[1]][2]*SXY)/SYY
#MII_B0_STDERROR <- sqrt((MODELII_RSS/(nrow(data)-2))*((1/nrow(data))+((mean(XD)^2)/SXX)))
#MII_B1_STDERROR <- sqrt((MODELII_RSS/(nrow(data)-2))/SXX)
#############################################################################
#Model II (major axis) analytical solution (modified code from MBARI)
n = length(XD);
#Calculate sums and other re-used expressions
Sx = sum(XD);
Sy = sum(YD);
xbar = Sx/n;
ybar = Sy/n;
U = XD - xbar;
V = YD - ybar;
Suv = sum(U * V);
Su2 = sum(U^2);
Sv2 = sum(V^2);
sigx = sqrt(Su2/(n-1));
sigy = sqrt(Sv2/(n-1));
#Calculate m, b, r, sm, and sb (slope, intercept, r, and errors of slope and intercept)
m = (Sv2 - Su2 + sqrt(((Sv2 - Su2)^2) + (4 * Suv^2)))/(2 * Suv);
b = ybar - m * xbar;
r = Suv / sqrt(Su2 * Sv2);
sm = (m/r) * sqrt((1 - r^2)/n);
sb1 = (sigy - sigx * m)^2;
sb2 = (2 * sigx * sigy) + ((xbar^2 * m * (1 + r))/r^2);
sb = sqrt((sb1 + ((1 - r) * m * sb2))/n);
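#Optional cross-check (a sketch only): the major-axis slope is the direction
#of the first eigenvector of the sample covariance matrix.
#EV <- eigen(cov(cbind(XD, YD)))$vectors[, 1]
#EV[2]/EV[1]   # should agree with m up to floating-point error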
#Computes the bootstrap; any estimator can easily be passed through this resampling loop
LIST_B0S <- c()
LIST_B1S <- c()
LIST_B1S_LADBOOT <- c()
LIST_B0S_LADBOOT <- c()
LADB0 <- LAD_SOLUTION[[1]][1]
LADB1 <- LAD_SOLUTION[[1]][2]
for(iter in 1:NUM_RESAMPLES){
#Resample (X, Y) pairs by index, so repeated Y values cannot scramble the pairing
BOOT_IDX <- sample(seq_along(YD), replace=TRUE)
BOOT_SAMPLE <- YD[BOOT_IDX]
BOOT_X <- XD[BOOT_IDX]
LAD_BOOT <- function(boot) {
EY <- boot[1] + boot[2]*BOOT_X
sum(abs(EY - BOOT_SAMPLE))
}
LAD_BOOT_SOLUTION <- optim(c(LADB0, LADB1), LAD_BOOT)
SXY_BOOT <- sum((BOOT_X - mean(BOOT_X))*(BOOT_SAMPLE - mean(BOOT_SAMPLE)))
SXX_BOOT <- sum((BOOT_X - mean(BOOT_X))^2)
B1_BOOT <- SXY_BOOT/SXX_BOOT
B0_BOOT <- mean(BOOT_SAMPLE) - (B1_BOOT*mean(BOOT_X))
LIST_B1S[iter] <- B1_BOOT
LIST_B0S[iter] <- B0_BOOT
#For a LAD STD ERROR estimation
LIST_B1S_LADBOOT[iter] <- LAD_BOOT_SOLUTION[[1]][2]
LIST_B0S_LADBOOT[iter] <- LAD_BOOT_SOLUTION[[1]][1]
}
BOOTSTRAP_B0 <- mean(LIST_B0S)
BOOTSTRAP_B1 <- mean(LIST_B1S)
BOOT_RESIDUALS <- (YD - (BOOTSTRAP_B0 + XD*BOOTSTRAP_B1))^2
#SUM_BR <- sum(BOOT_RESIDUALS)
#BOOTSTRAP_B0_STDERROR <- sqrt((SUM_BR/(nrow(data)-2))*((1/nrow(data))+((mean(XD)^2)/SXX)))
#BOOTSTRAP_B1_STDERROR <- sqrt((SUM_BR/(nrow(data)-2))/SXX)
#A perfect fit (all residuals zero) gives a degenerate bootstrap distribution
if(sum(BOOT_RESIDUALS) == 0){
BOOTSTRAP_B0_STDERROR <- 0
} else BOOTSTRAP_B0_STDERROR <- sd(LIST_B0S)
BOOTSTRAP_B1_STDERROR <- sd(LIST_B1S)
LAD_B0_STDERROR <- sd(LIST_B0S_LADBOOT)
LAD_B1_STDERROR <- sd(LIST_B1S_LADBOOT)
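#Optional extra (a sketch, not part of the output): percentile confidence
#intervals can be read straight off the bootstrap distributions, e.g.
#quantile(LIST_B1S, c(0.025, 0.975))   # 95% percentile CI for the slope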
#Computes a 'jackknife' by repeated half-sampling (delete-half subsampling, not leave-one-out); as with the bootstrap, any estimator can easily be passed through here as well
LIST_J1S <- c()
LIST_J0S <- c()
for(j in 1:(2*NUM_RESAMPLES)) {
#Draw half-samples by index (without replacement), alternating between
#floor and ceiling sizes, so each (X, Y) pair stays intact
if(j %% 2 == 0) {
JACK_IDX <- sample(seq_along(YD), floor(length(YD)/2), replace=FALSE)
} else JACK_IDX <- sample(seq_along(YD), ceiling(length(YD)/2), replace=FALSE)
JACK_SAMPLE <- YD[JACK_IDX]
JACK_X <- XD[JACK_IDX]
SXY_JACK <- sum((JACK_X - mean(JACK_X))*(JACK_SAMPLE - mean(JACK_SAMPLE)))
SXX_JACK <- sum((JACK_X - mean(JACK_X))^2)
B1_JACK <- SXY_JACK/SXX_JACK
B0_JACK <- mean(JACK_SAMPLE) - (B1_JACK*(mean(JACK_X)))
LIST_J1S[j] <- B1_JACK
LIST_J0S[j] <- B0_JACK
}
JACK_B1 <- mean(LIST_J1S)
JACK_B0 <- mean(LIST_J0S)
JACK_B1_STDERROR <- sd(LIST_J1S)
JACK_B0_STDERROR <- sd(LIST_J0S)
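#For comparison, a classic leave-one-out jackknife of the OLS slope (a sketch
#only; not used below):
#LOO_B1 <- sapply(seq_along(XD), function(d) {
#  x <- XD[-d]; y <- YD[-d]
#  sum((x - mean(x))*(y - mean(y)))/sum((x - mean(x))^2)
#})
#sd(LOO_B1)*(length(XD)-1)/sqrt(length(XD))   # jackknife SE of the slope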
#Computes k-fold cross-validation (for LS only)
#Shuffles the (X, Y) pairs with a single random permutation of the indices
PERM <- sample(seq_along(YD))
K_YD <- YD[PERM]
K_XD <- XD[PERM]
#This defines the breaks of the set into k parts
SEQ <- seq_along(K_XD)
MAX <- length(K_XD)/KFOLDS
SPLITS_X <- split(K_XD, ceiling(SEQ/MAX))
SPLITS_Y <- split(K_YD, ceiling(SEQ/MAX))
#This sets a common index for future iterations (union() of a list with itself
#standardises it so setdiff() below can drop one fold at a time)
UNION_X <- union(SPLITS_X, SPLITS_X)
UNION_Y <- union(SPLITS_Y, SPLITS_Y)
LIST_K_B1 <- c()
LIST_K_B0 <- c()
K_MII_B0 <- c()
K_MII_B1 <- c()
LIST_K_RSS <- c()
LIST_K_MII_RSS <- c()
for(i in 1:length(SPLITS_X)) {
#Training set: every fold except fold i; the held-out fold i is the test set
K_SETS_X <- setdiff(UNION_X, SPLITS_X[i])
K_SET_X <- unlist(K_SETS_X)
K_SETS_Y <- setdiff(UNION_Y, SPLITS_Y[i])
K_SET_Y <- unlist(K_SETS_Y)
K_SXY <- sum((K_SET_X - mean(K_SET_X))*(K_SET_Y-mean(K_SET_Y)))
K_SXX <- sum((K_SET_X-mean(K_SET_X))^2)
K_B1 <- K_SXY/K_SXX
K_B0 <- mean(K_SET_Y) - K_B1*mean(K_SET_X)
K_TEST_Y <- setdiff(UNION_Y, K_SETS_Y)
K_TEST_X <- setdiff(UNION_X, K_SETS_X)
K_RSS <- sum((K_TEST_Y[[1]] - (K_B0 + K_B1*K_TEST_X[[1]]))^2)
LIST_K_RSS[i] <- K_RSS
LIST_K_B0[i] <- K_B0
LIST_K_B1[i] <- K_B1
################## MII ####################
#Same MBARI major-axis fit as above, computed on the training folds only
#(fitting on the full data set would defeat the cross-validation)
n = length(K_SET_X);
#Calculate sums and other re-used expressions
Sx = sum(K_SET_X);
Sy = sum(K_SET_Y);
xbar = Sx/n;
ybar = Sy/n;
U = K_SET_X - xbar;
V = K_SET_Y - ybar;
Suv = sum(U * V);
Su2 = sum(U^2);
Sv2 = sum(V^2);
sigx = sqrt(Su2/(n-1));
sigy = sqrt(Sv2/(n-1));
#Calculate m, b, r, sm, and sb (slope, intercept, r, and errors of slope and intercept)
m = (Sv2 - Su2 + sqrt(((Sv2 - Su2)^2) + (4 * Suv^2)))/(2 * Suv);
b = ybar - m * xbar;
r = Suv / sqrt(Su2 * Sv2);
sm = (m/r) * sqrt((1 - r^2)/n);
sb1 = (sigy - sigx * m)^2;
sb2 = (2 * sigx * sigy) + ((xbar^2 * m * (1 + r))/r^2);
sb = sqrt((sb1 + ((1 - r) * m * sb2))/n);
K_MII_RSS <- sum(((K_TEST_Y[[1]] - (b + m*K_TEST_X[[1]]))^2)/(1+ (m^2)))
LIST_K_MII_RSS[i] <- K_MII_RSS
K_MII_B0[i] <- b
K_MII_B1[i] <- m
}
##### K MI #####
K_B0 <- mean(LIST_K_B0)
K_B1 <- mean(LIST_K_B1)
K_RSS <- mean(LIST_K_RSS)
K_CHECK <- 1-(abs(K_RSS-RSS_MI)/RSS_MI)
K_B0_STDERROR <- sd(LIST_K_B0)
K_B1_STDERROR <- sd(LIST_K_B1)
##### K MII #####
K_MII_B0_STDERROR <- sd(K_MII_B0)
K_MII_B1_STDERROR <- sd(K_MII_B1)
K_MII_RSS <- mean(LIST_K_MII_RSS)
K_MII_CHECK <- 1-(abs(K_MII_RSS-RSS_MI)/RSS_MI)
#Collapse the per-fold vectors to their means (the SDs above are taken first on purpose)
K_MII_B0 <- mean(K_MII_B0)
K_MII_B1 <- mean(K_MII_B1)
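#How the folds are formed (an illustrative sketch with made-up sizes): with 10
#points and KFOLDS = 5, ceiling(seq_along(x)/(10/5)) labels the shuffled data
#1 1 2 2 3 3 4 4 5 5, i.e. five contiguous folds of two.
#ceiling(seq_len(10)/(10/5))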
#Outputs begin here
if(INDIVIDUAL_PLOTS==0){
par(mfrow=c(1,1))
colours <- c("red", "cornflowerblue", "green", "purple", "orange", "deeppink", "tomato", "black")
models <- c("LAD", "ML", "MII LS", paste("BS OLS, n=",NUM_RESAMPLES), paste("JACK OLS, n=", 2*NUM_RESAMPLES), paste("KFOLD OLS, k=", KFOLDS), paste("KFOLD MII LS, k=", KFOLDS), "MI LS")
plot(YD~XD, ylab=names[2], xlab=names[1])
abline(LAD_SOLUTION[[1]][1], LAD_SOLUTION[[1]][2], col="red")
abline(MAXIMUM_LIKELIHOOD_SOLUTION[[1]][1], MAXIMUM_LIKELIHOOD_SOLUTION[[1]][2], col="cornflowerblue")
abline(b, m, col="green")
abline(BOOTSTRAP_B0, BOOTSTRAP_B1, col="purple")
abline(JACK_B0, JACK_B1, col="orange")
abline(K_B0, K_B1, col="deeppink")
abline(K_MII_B0, K_MII_B1, col="tomato")
abline(MI_B0, MI_B1, col="black")
legend("topleft", inset=0.02, legend=models, col=colours, lty=1, cex=0.7)
}
if(INDIVIDUAL_PLOTS==1){
par(mfrow=c(2,4))
plot(YD~XD, ylab=names[2], xlab=names[1], main="LAD")
abline(LAD_SOLUTION[[1]][1], LAD_SOLUTION[[1]][2], col="red")
plot(YD~XD, ylab=names[2], xlab=names[1], main="ML")
abline(MAXIMUM_LIKELIHOOD_SOLUTION[[1]][1], MAXIMUM_LIKELIHOOD_SOLUTION[[1]][2], col="cornflowerblue")
plot(YD~XD, ylab=names[2], xlab=names[1], main="MII LS")
abline(b, m, col="green")
plot(YD~XD, ylab=names[2], xlab=names[1], main=paste("BS OLS, n=", NUM_RESAMPLES))
abline(BOOTSTRAP_B0, BOOTSTRAP_B1, col="purple")
plot(YD~XD, ylab=names[2], xlab=names[1], main=paste("JACK OLS, n=", 2*NUM_RESAMPLES))
abline(JACK_B0, JACK_B1, col="orange")
plot(YD~XD, ylab=names[2], xlab=names[1], main=paste("KFOLD OLS, k=", KFOLDS))
abline(K_B0, K_B1, col="deeppink")
plot(YD~XD, ylab=names[2], xlab=names[1], main=paste("KFOLD MII LS, k=", KFOLDS))
abline(K_MII_B0, K_MII_B1, col="tomato")
plot(YD~XD, ylab=names[2], xlab=names[1], main="MI LS")
abline(MI_B0, MI_B1, col="black")
}
ESTIMATOR_NAMES <- c("LAD_B0", "LAD_B1", "ML_B0", "ML_B1", "MI_B0", "MI_B1", "MII_B0", "MII_B1", "BS_OLS_B0", "BS_OLS_B1", "JACK_OLS_B0", "JACK_OLS_B1", "KFOLD_OLS_B0", "KFOLD_OLS_B1", "KFOLD_MII_LS_B0", "KFOLD_MII_LS_B1")
ESTIMATOR_VALUES <- c(LAD_SOLUTION[[1]][1], LAD_SOLUTION[[1]][2], MAXIMUM_LIKELIHOOD_SOLUTION[[1]][1], MAXIMUM_LIKELIHOOD_SOLUTION[[1]][2], MI_B0, MI_B1, b, m, BOOTSTRAP_B0, BOOTSTRAP_B1, JACK_B0, JACK_B1, K_B0, K_B1, K_MII_B0, K_MII_B1)
ESTIMATE_STD_ERROR <- c(LAD_B0_STDERROR, LAD_B1_STDERROR, ML_B0_STDERROR, ML_B1_STDERROR, MI_B0_STDERROR, MI_B1_STDERROR, sb, sm, BOOTSTRAP_B0_STDERROR, BOOTSTRAP_B1_STDERROR, JACK_B0_STDERROR, JACK_B1_STDERROR, K_B0_STDERROR, K_B1_STDERROR, K_MII_B0_STDERROR, K_MII_B1_STDERROR)
#Computes t-values and concomitant p-values
T_VALUES <- c()
P_VALUES <- c()
GOOD_P <- c()
for(s in 1:length(ESTIMATE_STD_ERROR)){
if(ESTIMATE_STD_ERROR[s]==0){
#A zero standard error gives an infinite t-value; set it arbitrarily high so p ~ 0
T_VALUES[s] <- 1e8
} else T_VALUES[s] <- ESTIMATOR_VALUES[s]/ESTIMATE_STD_ERROR[s]
if(T_VALUES[s] < 0){
P_VALUES[s] <- 2*pt(T_VALUES[s],nrow(data)-2,lower.tail=TRUE)
} else P_VALUES[s] <- 2*pt(T_VALUES[s],nrow(data)-2,lower.tail=FALSE)
if(P_VALUES[s] < 0.0000005){
GOOD_P[s] <- "***"
} else if(P_VALUES[s] < 0.0005){
GOOD_P[s] <- "**"
} else if(P_VALUES[s] < 0.05){
GOOD_P[s] <- "*"
} else GOOD_P[s] <- ""
}
#Naive model selection: flag the estimators with the smallest B0 and B1 p-values
B0_P <- c()
B1_P <- c()
FOR_MODEL_SELECTION <- c(rep("", length(P_VALUES)))
#Estimator rows alternate B0, B1: odd indices hold intercept p-values, even
#indices hold slope p-values (unfilled positions stay NA)
for(p in 1:length(P_VALUES)) {
if(p %% 2 == 0) {
B1_P[p] <- P_VALUES[p]
} else B0_P[p] <- P_VALUES[p]
}
MIN_B0_P <- min(B0_P, na.rm=TRUE)
MIN_B1_P <- min(B1_P, na.rm=TRUE)
for(id in 1:length(P_VALUES)){
if(MIN_B0_P == P_VALUES[id]){
FOR_MODEL_SELECTION[id] <- "BEST B0"
}
if(MIN_B1_P == P_VALUES[id]){
FOR_MODEL_SELECTION[id] <- "BEST B1"
}
#If every p-value is identical, no single model stands out
if(sum(P_VALUES)==P_VALUES[1]*length(P_VALUES)){
FOR_MODEL_SELECTION[id] <- "ALL MODELS SUITABLE"
}
}
#Get the 95% confidence half-widths: qt(0.975, n-2) * standard error
CONF_INTS <- c()
for(v in 1:length(ESTIMATOR_VALUES)){
CONF_INTS[v] <- (qt(0.975, nrow(data)-2, lower.tail=TRUE))*ESTIMATE_STD_ERROR[v]
}
table_output <- data.frame(Model_Estimator=ESTIMATOR_NAMES, Estimate=round(ESTIMATOR_VALUES, 4), Confidence=paste("+/-", round(CONF_INTS,4)), Std_Error=round(ESTIMATE_STD_ERROR,4), t_value=round(T_VALUES, 4), p_value=round(P_VALUES, 6), sig=GOOD_P, Model_Selection=FOR_MODEL_SELECTION)
print(table_output)
cat("\n")
cat("for sig: p<0.0000005 = ***, p<0.0005 = **, p<0.05 = *")
cat("\n")
cat("Model selection only considers the lowest p-values for B0 and B1")
cat("\n")
cat("KFOLD cross validation check outputs RSS %difference of", K_CHECK, "for MI LS")
cat("\n")
cat("KFOLD cross validation check outputs RSS %difference of", K_MII_CHECK, "for MII LS")
cat("\n")
#Warn when every standard error is zero (e.g. perfectly collinear data)
if(all(ESTIMATE_STD_ERROR == 0)){
cat("\n")
cat("Your values may be perfectly correlated; the t-values have been set arbitrarily large as they approach infinity")
}
}
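
#Minimal usage sketch (the CSV itself is chosen interactively via file.choose()):
#CATCHALL()                  # defaults: 100 resamples, 5 folds, combined plot
#CATCHALL(200, 10, 1)        # 200 resamples, 10 folds, one panel per model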