Created
October 27, 2018 03:40
-
-
Save WinstonCampeau/4912a7ad0f681f780295838d65c0bcd8 to your computer and use it in GitHub Desktop.
R - Catchall (A plethora of 2D model analyses)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CATCHALL <- function(NUM_RESAMPLES = 100, KFOLDS = 5, INDIVIDUAL_PLOTS = 0) {
  # Fit a simple linear model y ~ x with several estimators -- ordinary least
  # squares ("Model I"), least absolute deviations (LAD), maximum likelihood
  # (ML), a geometric-mean "Model II" regression, plus bootstrap, jackknife,
  # and k-fold cross-validated OLS/Model II -- then plot the fitted lines and
  # print a comparison table of estimates, standard errors, t and p values.
  #
  # NUM_RESAMPLES    number of bootstrap resamples (jackknife uses twice this).
  # KFOLDS           number of folds for cross-validation.
  # INDIVIDUAL_PLOTS 0 = all fits overlaid on one plot; 1 = one panel per fit.
  #
  # The data file is chosen interactively and must be a two-column csv
  # (x in column 1, y in column 2) with a header row.
  cat("Please select your data; The data must be (X_COLUMN, Y_COLUMN) and csv file type\nIf you did not enter a number of bootstrap resamples, the default is set to 100 - double for jackknife - and defaults to 5 kfolds")
  cat("\n")
  cat("\n")
  dat <- read.csv(file.choose(), header = TRUE)
  XD <- as.vector(dat[[1]])
  YD <- as.vector(dat[[2]])
  axis_labels <- colnames(dat)
  n_obs <- nrow(dat)

  SXY <- sum((XD - mean(XD)) * (YD - mean(YD)))
  SXX <- sum((XD - mean(XD))^2)
  SYY <- sum((YD - mean(YD))^2)

  # Crude starting values for the optimisers: slope through the first and
  # last points, intercept from the means.
  EST_B1 <- (YD[length(YD)] - YD[1]) / (XD[length(XD)] - XD[1])
  EST_B0 <- mean(YD) - EST_B1 * mean(XD)

  # ---- Model I: ordinary least squares, closed form ----
  MI_B1 <- SXY / SXX
  MI_B0 <- mean(YD) - MI_B1 * mean(XD)
  YHATS <- MI_B0 + MI_B1 * XD
  RSS_MI <- sum((YHATS - YD)^2)
  MI_B0_STDERROR <- sqrt((RSS_MI / (n_obs - 2)) * ((1 / n_obs) + (mean(XD)^2 / SXX)))
  MI_B1_STDERROR <- sqrt((RSS_MI / (n_obs - 2)) / SXX)

  # ---- Least absolute deviations, by numeric optimisation ----
  LAD <- function(A) {
    sum(abs(A[1] + A[2] * XD - YD))
  }
  LAD_SOLUTION <- optim(c(EST_B0, EST_B1), LAD)
  LAD_RSS <- sum((YD - (LAD_SOLUTION$par[1] + LAD_SOLUTION$par[2] * XD))^2)

  # ---- Maximum likelihood with normal errors ----
  # (Equivalent to OLS under the normality assumption.)
  MAXIMUM_LIKELIHOOD <- function(B) {
    RESIDUALS <- YD - (B[1] + B[2] * XD)
    -sum(dnorm(RESIDUALS, mean(RESIDUALS), sd = sd(RESIDUALS), log = TRUE))
  }
  MAXIMUM_LIKELIHOOD_SOLUTION <- try(optim(c(MI_B0, MI_B1), MAXIMUM_LIKELIHOOD), silent = TRUE)
  if (inherits(MAXIMUM_LIKELIHOOD_SOLUTION, "try-error")) {
    # Optimisation failed: fall back on the OLS coefficients and reuse the
    # OLS standard errors.  BUG FIX: the original's un-braced `else ...; ...`
    # recomputed ML_B1_STDERROR unconditionally, clobbering this fallback.
    MAXIMUM_LIKELIHOOD_SOLUTION <- LAD_SOLUTION
    MAXIMUM_LIKELIHOOD_SOLUTION$par[1] <- MI_B0
    MAXIMUM_LIKELIHOOD_SOLUTION$par[2] <- MI_B1
    ML_B0_STDERROR <- MI_B0_STDERROR
    ML_B1_STDERROR <- MI_B1_STDERROR
  } else {
    # BUG FIX: the original plugged the optimiser's objective value (the
    # negative log-likelihood) into a standard-error formula that expects a
    # residual sum of squares.  Compute the RSS of the ML fit instead.
    ML_RSS <- sum((YD - (MAXIMUM_LIKELIHOOD_SOLUTION$par[1] +
                           MAXIMUM_LIKELIHOOD_SOLUTION$par[2] * XD))^2)
    ML_B0_STDERROR <- sqrt((ML_RSS / (n_obs - 2)) * ((1 / n_obs) + (mean(XD)^2 / SXX)))
    ML_B1_STDERROR <- sqrt((ML_RSS / (n_obs - 2)) / SXX)
  }

  # ---- Model II analytical solution (modified code from MBARI) ----
  # Slope m, intercept b, correlation r, and their errors sm / sb.
  n <- length(XD)
  xbar <- sum(XD) / n
  ybar <- sum(YD) / n
  U <- XD - xbar
  V <- YD - ybar
  Suv <- sum(U * V)
  Su2 <- sum(U^2)
  Sv2 <- sum(V^2)
  sigx <- sqrt(Su2 / (n - 1))
  sigy <- sqrt(Sv2 / (n - 1))
  m <- (Sv2 - Su2 + sqrt((Sv2 - Su2)^2 + 4 * Suv^2)) / (2 * Suv)
  b <- ybar - m * xbar
  r <- Suv / sqrt(Su2 * Sv2)
  sm <- (m / r) * sqrt((1 - r^2) / n)
  sb1 <- (sigy - sigx * m)^2
  sb2 <- (2 * sigx * sigy) + ((xbar^2 * m * (1 + r)) / r^2)
  sb <- sqrt((sb1 + ((1 - r) * m * sb2)) / n)

  # ---- Bootstrap of the OLS and LAD fits ----
  # BUG FIX: the original resampled Y and recovered each paired X by matching
  # on Y *values*, which silently pairs the wrong X whenever Y contains ties.
  # Resampling row indices keeps (x, y) pairs intact.
  LIST_B0S <- numeric(NUM_RESAMPLES)
  LIST_B1S <- numeric(NUM_RESAMPLES)
  LIST_B0S_LADBOOT <- numeric(NUM_RESAMPLES)
  LIST_B1S_LADBOOT <- numeric(NUM_RESAMPLES)
  LADB0 <- LAD_SOLUTION$par[1]
  LADB1 <- LAD_SOLUTION$par[2]
  for (iter in seq_len(NUM_RESAMPLES)) {
    idx <- sample(seq_along(YD), replace = TRUE)
    BOOT_X <- XD[idx]
    BOOT_SAMPLE <- YD[idx]
    # LAD objective on the resampled pairs (for a LAD std-error estimate).
    LAD_BOOT <- function(boot) {
      sum(abs(boot[1] + boot[2] * BOOT_X - BOOT_SAMPLE))
    }
    LAD_BOOT_SOLUTION <- optim(c(LADB0, LADB1), LAD_BOOT)
    SXY_BOOT <- sum((BOOT_X - mean(BOOT_X)) * (BOOT_SAMPLE - mean(BOOT_SAMPLE)))
    SXX_BOOT <- sum((BOOT_X - mean(BOOT_X))^2)
    B1_BOOT <- SXY_BOOT / SXX_BOOT
    B0_BOOT <- mean(BOOT_SAMPLE) - B1_BOOT * mean(BOOT_X)
    LIST_B1S[iter] <- B1_BOOT
    LIST_B0S[iter] <- B0_BOOT
    LIST_B1S_LADBOOT[iter] <- LAD_BOOT_SOLUTION$par[2]
    LIST_B0S_LADBOOT[iter] <- LAD_BOOT_SOLUTION$par[1]
  }
  BOOTSTRAP_B0 <- mean(LIST_B0S)
  BOOTSTRAP_B1 <- mean(LIST_B1S)
  BOOT_RESIDUALS <- (YD - (BOOTSTRAP_B0 + XD * BOOTSTRAP_B1))^2
  # BUG FIX: the original condition sum(BOOT_RESIDUALS == 0) fired whenever
  # *any* residual was exactly zero; a perfect fit means they are *all* zero
  # (the residuals are squared, so their sum is zero iff each one is).
  if (sum(BOOT_RESIDUALS) == 0) {
    BOOTSTRAP_B0_STDERROR <- 0
    BOOTSTRAP_B1_STDERROR <- 0
  } else {
    BOOTSTRAP_B0_STDERROR <- sd(LIST_B0S)
    BOOTSTRAP_B1_STDERROR <- sd(LIST_B1S)
  }
  LAD_B0_STDERROR <- sd(LIST_B0S_LADBOOT)
  LAD_B1_STDERROR <- sd(LIST_B1S_LADBOOT)

  # ---- Delete-half jackknife of the OLS fit ----
  # Alternates between floor- and ceiling-sized half samples, drawn without
  # replacement by row index (same tie fix as the bootstrap above).
  LIST_J0S <- numeric(2 * NUM_RESAMPLES)
  LIST_J1S <- numeric(2 * NUM_RESAMPLES)
  for (j in seq_len(2 * NUM_RESAMPLES)) {
    half <- if (j %% 2 == 0) floor(length(YD) / 2) else ceiling(length(YD) / 2)
    idx <- sample(seq_along(YD), half, replace = FALSE)
    JACK_X <- XD[idx]
    JACK_SAMPLE <- YD[idx]
    SXY_JACK <- sum((JACK_X - mean(JACK_X)) * (JACK_SAMPLE - mean(JACK_SAMPLE)))
    SXX_JACK <- sum((JACK_X - mean(JACK_X))^2)
    B1_JACK <- SXY_JACK / SXX_JACK
    B0_JACK <- mean(JACK_SAMPLE) - B1_JACK * mean(JACK_X)
    LIST_J1S[j] <- B1_JACK
    LIST_J0S[j] <- B0_JACK
  }
  JACK_B1 <- mean(LIST_J1S)
  JACK_B0 <- mean(LIST_J0S)
  JACK_B1_STDERROR <- sd(LIST_J1S)
  JACK_B0_STDERROR <- sd(LIST_J0S)

  # ---- K-fold cross-validation (OLS and Model II) ----
  # Shuffle the rows by permuting indices (avoids the value-matching tie bug),
  # then split the shuffled data into KFOLDS contiguous folds.
  perm <- sample(seq_along(YD))
  K_XD <- XD[perm]
  K_YD <- YD[perm]
  SEQ <- seq_along(K_XD)
  MAX <- length(K_XD) / KFOLDS
  SPLITS_X <- split(K_XD, ceiling(SEQ / MAX))
  SPLITS_Y <- split(K_YD, ceiling(SEQ / MAX))
  LIST_K_B0 <- numeric(length(SPLITS_X))
  LIST_K_B1 <- numeric(length(SPLITS_X))
  K_MII_B0 <- numeric(length(SPLITS_X))
  K_MII_B1 <- numeric(length(SPLITS_X))
  LIST_K_RSS <- numeric(length(SPLITS_X))
  LIST_K_MII_RSS <- numeric(length(SPLITS_X))
  for (i in seq_along(SPLITS_X)) {
    # Training set = every fold but i; test set = fold i.
    K_SET_X <- unlist(SPLITS_X[-i], use.names = FALSE)
    K_SET_Y <- unlist(SPLITS_Y[-i], use.names = FALSE)
    K_TEST_X <- SPLITS_X[[i]]
    K_TEST_Y <- SPLITS_Y[[i]]
    # OLS on the training fold, scored on the held-out fold.
    K_SXY <- sum((K_SET_X - mean(K_SET_X)) * (K_SET_Y - mean(K_SET_Y)))
    K_SXX <- sum((K_SET_X - mean(K_SET_X))^2)
    K_B1 <- K_SXY / K_SXX
    K_B0 <- mean(K_SET_Y) - K_B1 * mean(K_SET_X)
    LIST_K_RSS[i] <- sum((K_TEST_Y - (K_B0 + K_B1 * K_TEST_X))^2)
    LIST_K_B0[i] <- K_B0
    LIST_K_B1[i] <- K_B1
    # Model II on the training fold.
    # BUG FIX: the original recomputed Model II from the *full* data set
    # (XD/YD, K_XD/K_YD) on every iteration -- so each "fold" produced the
    # same fit -- and clobbered the global Model II variables (n, b, m, sm,
    # sb, ...) in the process.  Use the training fold and fold-local names.
    n_k <- length(K_SET_X)
    xbar_k <- mean(K_SET_X)
    ybar_k <- mean(K_SET_Y)
    U_k <- K_SET_X - xbar_k
    V_k <- K_SET_Y - ybar_k
    Suv_k <- sum(U_k * V_k)
    Su2_k <- sum(U_k^2)
    Sv2_k <- sum(V_k^2)
    m_k <- (Sv2_k - Su2_k + sqrt((Sv2_k - Su2_k)^2 + 4 * Suv_k^2)) / (2 * Suv_k)
    b_k <- ybar_k - m_k * xbar_k
    # Model II loss: perpendicular-style residuals, scored on the test fold.
    LIST_K_MII_RSS[i] <- sum(((K_TEST_Y - (b_k + m_k * K_TEST_X))^2) / (1 + m_k^2))
    K_MII_B0[i] <- b_k
    K_MII_B1[i] <- m_k
  }
  ##### K MI #####
  K_B0 <- mean(LIST_K_B0)
  K_B1 <- mean(LIST_K_B1)
  K_RSS <- mean(LIST_K_RSS)
  K_CHECK <- 1 - (abs(K_RSS - RSS_MI) / RSS_MI)
  K_B0_STDERROR <- sd(LIST_K_B0)
  K_B1_STDERROR <- sd(LIST_K_B1)
  ##### K MII #####
  K_MII_B0_STDERROR <- sd(K_MII_B0)
  K_MII_B1_STDERROR <- sd(K_MII_B1)
  K_MII_RSS <- mean(LIST_K_MII_RSS)
  K_MII_CHECK <- 1 - (abs(K_MII_RSS - RSS_MI) / RSS_MI)
  K_MII_B0 <- mean(K_MII_B0)
  K_MII_B1 <- mean(K_MII_B1)

  # ---- Plots ----
  if (INDIVIDUAL_PLOTS == 0) {
    par(mfrow = c(1, 1))
    colours <- c("red", "cornflowerblue", "green", "purple", "orange", "deeppink", "tomato", "black")
    models <- c("LAD", "ML", "MII LS", paste("BS OLS, n=", NUM_RESAMPLES),
                paste("JACK OLS, n=", 2 * NUM_RESAMPLES),
                paste("KFOLD OLS, k=", KFOLDS),
                paste("KFOLD MII LS, k=", KFOLDS), "MI LS")
    plot(YD ~ XD, ylab = axis_labels[2], xlab = axis_labels[1])
    abline(LAD_SOLUTION$par[1], LAD_SOLUTION$par[2], col = "red")
    abline(MAXIMUM_LIKELIHOOD_SOLUTION$par[1], MAXIMUM_LIKELIHOOD_SOLUTION$par[2], col = "cornflowerblue")
    abline(b, m, col = "green")
    abline(BOOTSTRAP_B0, BOOTSTRAP_B1, col = "purple")
    abline(JACK_B0, JACK_B1, col = "orange")
    abline(K_B0, K_B1, col = "deeppink")
    abline(K_MII_B0, K_MII_B1, col = "tomato")
    abline(MI_B0, MI_B1, col = "black")
    legend("topleft", inset = 0.02, legend = models, col = colours, lty = 1, cex = 0.7)
  }
  if (INDIVIDUAL_PLOTS == 1) {
    par(mfrow = c(2, 4))
    plot(YD ~ XD, ylab = axis_labels[2], xlab = axis_labels[1], main = "LAD")
    abline(LAD_SOLUTION$par[1], LAD_SOLUTION$par[2], col = "red")
    plot(YD ~ XD, ylab = axis_labels[2], xlab = axis_labels[1], main = "ML")
    abline(MAXIMUM_LIKELIHOOD_SOLUTION$par[1], MAXIMUM_LIKELIHOOD_SOLUTION$par[2], col = "cornflowerblue")
    plot(YD ~ XD, ylab = axis_labels[2], xlab = axis_labels[1], main = "MII LS")
    abline(b, m, col = "green")
    plot(YD ~ XD, ylab = axis_labels[2], xlab = axis_labels[1], main = paste("BS OLS, n=", NUM_RESAMPLES))
    abline(BOOTSTRAP_B0, BOOTSTRAP_B1, col = "purple")
    plot(YD ~ XD, ylab = axis_labels[2], xlab = axis_labels[1], main = paste("JACK OLS, n=", 2 * NUM_RESAMPLES))
    abline(JACK_B0, JACK_B1, col = "orange")
    plot(YD ~ XD, ylab = axis_labels[2], xlab = axis_labels[1], main = paste("KFOLD OLS, k=", KFOLDS))
    abline(K_B0, K_B1, col = "deeppink")
    plot(YD ~ XD, ylab = axis_labels[2], xlab = axis_labels[1], main = paste("KFOLD MII LS, k=", KFOLDS))
    abline(K_MII_B0, K_MII_B1, col = "tomato")
    plot(YD ~ XD, ylab = axis_labels[2], xlab = axis_labels[1], main = "MI LS")
    abline(MI_B0, MI_B1, col = "black")
  }

  # ---- Summary table ----
  ESTIMATOR_NAMES <- c("LAD_B0", "LAD_B1", "ML_B0", "ML_B1", "MI_B0", "MI_B1",
                       "MII_B0", "MII_B1", "BS_OLS_B0", "BS_OLS_B1",
                       "JACK_OLS_B0", "JACK_OLS_B1", "KFOLD_OLS_B0",
                       "KFOLD_OLS_B1", "KFOLD_MII_LS_B0", "KFOLD_MII_LS_B1")
  ESTIMATOR_VALUES <- c(LAD_SOLUTION$par[1], LAD_SOLUTION$par[2],
                        MAXIMUM_LIKELIHOOD_SOLUTION$par[1],
                        MAXIMUM_LIKELIHOOD_SOLUTION$par[2],
                        MI_B0, MI_B1, b, m, BOOTSTRAP_B0, BOOTSTRAP_B1,
                        JACK_B0, JACK_B1, K_B0, K_B1, K_MII_B0, K_MII_B1)
  ESTIMATE_STD_ERROR <- c(LAD_B0_STDERROR, LAD_B1_STDERROR, ML_B0_STDERROR,
                          ML_B1_STDERROR, MI_B0_STDERROR, MI_B1_STDERROR,
                          sb, sm, BOOTSTRAP_B0_STDERROR, BOOTSTRAP_B1_STDERROR,
                          JACK_B0_STDERROR, JACK_B1_STDERROR, K_B0_STDERROR,
                          K_B1_STDERROR, K_MII_B0_STDERROR, K_MII_B1_STDERROR)
  # t statistics and two-sided p values.  A zero standard error would give an
  # infinite t, so cap it at an arbitrarily large value (p effectively 0).
  T_VALUES <- ifelse(ESTIMATE_STD_ERROR == 0, 100000000,
                     ESTIMATOR_VALUES / ESTIMATE_STD_ERROR)
  P_VALUES <- 2 * pt(-abs(T_VALUES), n_obs - 2)
  GOOD_P <- ifelse(P_VALUES < 0.0000005, "***",
                   ifelse(P_VALUES < 0.0005, "**",
                          ifelse(P_VALUES < 0.05, "*", "")))
  # Lazy selection for 'BEST' model: lowest p value per coefficient.
  # B0 estimates sit at the odd positions, B1 estimates at the even ones.
  FOR_MODEL_SELECTION <- rep("", length(P_VALUES))
  MIN_B0_P <- min(P_VALUES[seq(1, length(P_VALUES), by = 2)])
  MIN_B1_P <- min(P_VALUES[seq(2, length(P_VALUES), by = 2)])
  for (id in seq_along(P_VALUES)) {
    if (P_VALUES[id] == MIN_B0_P) {
      FOR_MODEL_SELECTION[id] <- "BEST B0"
    }
    if (P_VALUES[id] == MIN_B1_P) {
      FOR_MODEL_SELECTION[id] <- "BEST B1"
    }
  }
  if (all(P_VALUES == P_VALUES[1])) {
    FOR_MODEL_SELECTION[] <- "ALL MODELS SUITABLE"
  }
  # 95% confidence half-widths (vectorised; the original looped).
  CONF_INTS <- qt(0.975, n_obs - 2) * ESTIMATE_STD_ERROR
  table_output <- data.frame(Model_Estimator = ESTIMATOR_NAMES,
                             Estimate = round(ESTIMATOR_VALUES, 4),
                             Confidence = paste("+/-", round(CONF_INTS, 4)),
                             Std_Error = round(ESTIMATE_STD_ERROR, 4),
                             t_value = round(T_VALUES, 4),
                             p_value = round(P_VALUES, 6),
                             sig = GOOD_P,
                             Model_Selection = FOR_MODEL_SELECTION)
  print(table_output)
  cat("\n")
  cat("for sig: p<0.0000005 = ***, p<0.0005 = **, p<0.05 = *")
  cat("\n")
  cat("Model selection only considers the lowest p-values for B0 and B1")
  cat("\n")
  cat("KFOLD cross validation check outputs RSS %difference of", K_CHECK, "for MI LS")
  cat("\n")
  cat("KFOLD cross validation check outputs RSS %difference of", K_MII_CHECK, "for MII LS")
  cat("\n")
  # BUG FIX: the original built an index vector with NA holes and relied on
  # try() to swallow the resulting `if (NA)` error; test the condition
  # ("every standard error is zero") directly instead.
  if (all(ESTIMATE_STD_ERROR == 0)) {
    cat("\n")
    cat("Your values may be perfectly correlated, your t-values have been set arbitrarily large as they are approaching infinity")
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment