Created
August 25, 2015 09:57
-
-
Save boyanangelov/bce390e1d3ce0698e50c to your computer and use it in GitHub Desktop.
Statistical inference function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function(y, x = NULL, | |
est = c("mean", "median", "proportion"), | |
success = NULL, order = NULL, | |
method = c("theoretical","simulation"), | |
type = c("ci","ht"), | |
alternative = c("less","greater","twosided"), | |
null = NULL, | |
boot_method = c("perc","se"), | |
conflevel = 0.95, siglevel = 0.05, | |
nsim = 10000, simdist = FALSE, seed = NULL, | |
sum_stats = TRUE, eda_plot = TRUE, inf_plot = TRUE, inf_lines = TRUE) { | |
# y: response variable, can be numerical or categorical | |
# x: explanatory variable, categorical (optional) | |
# est: parameter to estimate: mean, median, or proportion | |
# success: which level of the categorical variable to call "success", i.e. do inference on | |
# order: when x is given, order of levels of x in which to subtract parameters | |
# method: CLT based (theoretical) or simulation based (randomization/bootstrapping) | |
# type: ci (confidence interval) or ht (hypothesis test) | |
# alternative: direction of the alternative hypothesis | |
# null: null value for a hypothesis test | |
# boot_method: percentile (perc) or standard error (se) | |
# conflevel: confidence level, value between 0 and 1 | |
# siglevel: significance level, value between 0 and 1 (used only for ANOVA to determine if posttests are necessary) | |
# nsim: number of simulations | |
# simdist: TRUE/FALSE - return the simulation distribution | |
# seed: NULL - set.seed() | |
# sum_stats: TRUE/FALSE - print summary stats | |
# eda_plot: TRUE/FALSE - print EDA plot | |
# inf_plot: TRUE/FALSE - print inference plot | |
# inf_lines: TRUE/FALSE - print lines on the inference plot for ci bounds or p-value | |
# set mirror | |
options(repos=structure(c(CRAN="http://cran.rstudio.com"))) | |
# load packages, if needed | |
#if (!("lmPerm" %in% names(installed.packages()[,"Package"]))) {install.packages("lmPerm")} | |
#suppressMessages(library(lmPerm, quietly = TRUE)) | |
if (!("openintro" %in% names(installed.packages()[,"Package"]))) {install.packages("openintro")} | |
suppressMessages(library(openintro, quietly = TRUE)) | |
if (!("BHH2" %in% names(installed.packages()[,"Package"]))) {install.packages("BHH2")} | |
suppressMessages(library(BHH2, quietly = TRUE)) | |
# source helper functions | |
source("http://d396qusza40orc.cloudfront.net/statistics/lab_resources/chiTail.R") | |
source("http://d396qusza40orc.cloudfront.net/statistics/lab_resources/FTail.R") | |
source("http://d396qusza40orc.cloudfront.net/statistics/lab_resources/normTail.R") | |
# names for plot labels | |
y_name = deparse(substitute(y)) | |
x_name = deparse(substitute(x)) | |
# set seed | |
if(!is.null(seed)){set.seed(seed)} | |
# plotting settings | |
if(eda_plot == TRUE & inf_plot == TRUE) {par(mfrow=c(1,2), mar = c(4,4,0.5,0.5))} | |
if(any(c(inf_plot,eda_plot) == FALSE)) {par(mfrow=c(1,1), mar = c(4,4,0.5,0.5))} | |
# error: weird y | |
if (length(y) == 1) {stop("Sample size of y is 1.", call. = FALSE)} | |
# handling of NAs: drop NAs, and if x is given, use pairwise complete | |
if (is.null(x)) {if (sum(is.na(y)) > 0) {y = y[!is.na(y)]}} | |
if (!is.null(x)) { | |
if (sum(is.na(y)) > 0 | sum(is.na(x)) > 0) { | |
y.temp = y[!is.na(y) & !is.na(x)] | |
x.temp = x[!is.na(y) & !is.na(x)] | |
y = y.temp | |
x = x.temp | |
} | |
} | |
# error: y or x is character, make factor | |
if (is.character(y)) {y = as.factor(y)} | |
if (is.character(x)) {x = as.factor(x)} | |
if (is.logical(y)) {y = as.factor(y)} | |
if (is.logical(x)) {x = as.factor(x)} | |
# error: variables not of same length | |
if (!is.null(x)) { | |
if (length(y) != length(x)) {stop("The two variables must be of same length.", call. = FALSE)} | |
} | |
# set variable type for y: numerical or categorical | |
y_type = "categorical" | |
if (is.numeric(y)) {y_type = "numerical"} | |
# set variable type for x: categorical, numerical (unused), or only1var | |
if (!is.null(x)) { | |
x_type = "categorical" | |
if (is.numeric(x)) {x_type = "numerical"} | |
} | |
if (is.null(x)) {x_type = "only1var"} | |
# warning: explanatory variable numerical, convert to categorical | |
if(x_type == "numerical"){ | |
x = as.factor(x) | |
x_type = "categorical" | |
warning("Explanatory variable was numerical, it has been converted to categorical. In order to avoid this warning, first convert your explanatory variable to a categorical variable using the as.factor() function.", call. = FALSE) | |
} | |
# error: explanatory variable only has one level | |
if(x_type == "categorical"){ | |
if(length(levels(x)) == 1){ | |
stop("Explanatory variable has only one level, should have at least two levels.", call. = FALSE) | |
} | |
} | |
# error: response variable is categorical, but only has one level | |
if(y_type == "categorical"){ | |
if(length(levels(y)) == 1){ | |
stop("Response variable has only one level, should have at least two levels.", call. = FALSE) | |
} | |
} | |
# print variable types | |
if (x_type == "only1Var") {cat(paste("One", y_type, "variable", "\n"))} | |
if (x_type == "categorical") {cat(paste("Response variable: ", y_type, ", Explanatory variable: ", x_type, "\n", sep = ""))} | |
# set number of levels | |
x_levels = 0 # numerical variable | |
y_levels = 0 # numerical variable | |
if (x_type == "categorical") {x_levels = length(levels(x))} | |
if (y_type == "categorical") {y_levels = length(levels(y))} | |
# error: missing type, method, est | |
if (length(type) > 1) {stop("Missing type: ci (confidence interval) or ht (hypothesis test).", call. = FALSE)} | |
if (length(method) > 1) {stop("Missing method: theoretical or simulation.", call. = FALSE)} | |
if (length(est) > 1) {stop("Missing estimate: mean, median, or proportion.", call. = FALSE)} | |
# error: method isn't theoretical or simulation | |
method_list = c("theoretical", "simulation") | |
method = tolower(gsub("\\s","", method)) | |
which_method = pmatch(method, method_list) | |
if(is.na(which_method)){stop("Method should be 'theoretical' or 'simulation'.", call. = FALSE)} | |
method = method_list[which_method] | |
# error: type isn't ci or ht | |
type_list = c("ci", "ht") | |
type = tolower(gsub("\\s","", type)) | |
which_type = pmatch(type, type_list) | |
if(is.na(which_type)){stop("Type should be 'ci' or 'ht'.", call. = FALSE)} | |
type = type_list[which_type] | |
# error: missing boot_method | |
if (method == "simulation" & type == "ci" & length(boot_method) > 1){ | |
stop("Missing boot_method: perc (percentile method) or se (standard error method).", call. = FALSE) | |
} | |
# error: boot_method isn't perc or se | |
if (method == "simulation" & type == "ci"){ | |
boot_method_list = c("perc", "se") | |
boot_method = tolower(gsub("\\s","", boot_method)) | |
which_boot_method = pmatch(boot_method, boot_method_list) | |
if(is.na(which_boot_method)){stop("Bootstrap type should be 'perc' or 'se'.", call. = FALSE)} | |
boot_method = boot_method_list[which_boot_method] | |
} | |
# error: alternative isn't less, greater, or twosided | |
if(type == "ht"){ | |
alternative_list = c("less","greater","twosided","notequal") | |
alternative = tolower(gsub("\\s","", alternative)) | |
which_alternative = pmatch(alternative, alternative_list) | |
if((length(which_alternative) == 1) & any(is.na(which_alternative))){ | |
stop("Alternative should be 'less', 'greater' or 'twosided'.", call. = FALSE) | |
} | |
if(any(which_alternative == 4)) {which_alternative = 3} | |
alternative = alternative_list[which_alternative] | |
} | |
# error / warning: issues with null values | |
if (type == "ht" & x_type == "only1Var" & is.null(null)) {stop("Missing null value.", call. = FALSE)} | |
if (type == "ht" & (x_levels == 2 & y_levels <= 2) & is.null(null)) { | |
null = 0 | |
warning("Missing null value, set to 0.", call. = FALSE) | |
} | |
if (type == "ht" & (x_levels > 2 | y_levels > 2) & !is.null(null)) { | |
if(y_type == "numerical"){ | |
warning("Ignoring null value since it's undefined for ANOVA.", call. = FALSE) | |
} | |
if(y_type == "categorical"){ | |
warning("Ignoring null value since it's undefined for chi-square test.", call. = FALSE) | |
} | |
} | |
# error / warning: issues with alternative | |
if (type == "ht" & length(alternative) > 1) { | |
if(x_levels <= 2 & y_levels <= 2) {stop("Missing alternative: less, greater, twosided.", call. = FALSE)} | |
if(x_levels > 2 | y_levels > 2) { | |
if(y_type == "numerical") { | |
alternative = "greater" | |
warning("Use alternative = 'greater' for ANOVA.", call. = FALSE) | |
} | |
if(y_type == "categorical") { | |
alternative = "greater" | |
warning("Use alternative = 'greater' for chi-square test.", call. = FALSE) | |
} | |
} | |
} | |
# error: categorical variables have more than two levels, but type is ci | |
if ((x_levels > 2 | y_levels > 2) & type == "ci") { | |
if(y_type == "numerical"){ | |
stop("Categorical variable has more than 2 levels, confidence interval is undefined, use ANOVA to test for a difference between means.", call. = FALSE) | |
} | |
if(y_type == "categorical"){ | |
stop("Categorical variable has more than 2 levels, confidence interval is not defined, use chi-square test of independence.", call. = FALSE) | |
} | |
} | |
if ((x_levels > 2 | y_levels > 2) & est == "median") { | |
if(y_type == "numerical"){ | |
stop("This function cannot be used to compare medians across more than 2 xs, use est = 'mean' for ANOVA.", call. = FALSE) | |
} | |
} | |
# error: estimate isn't mean, median, or proportion | |
if (est %in% c("mean", "median", "proportion") == FALSE) { | |
stop("Estimate should be 'mean', 'median', or 'proportion'.", call. = FALSE) | |
} | |
# error: wrong estimate | |
if (y_type == "numerical" & est == "proportion") { | |
stop("Response variable is numerical, sample statistic cannot be a proportion, choose either mean or median.", call. = FALSE) | |
} | |
if (y_type == "categorical" & (est == "mean" | est == "median")) { | |
stop("Response variable is categorical, sample statistic cannot be a mean or a median, use proportion.", call. = FALSE) | |
} | |
# error: x variable has more than two levels, but alternative is not defined properly (chi-square and ANOVA) | |
if (x_type == "categorical" & x_levels > 2 & length(alternative) == 1) { | |
if(alternative != "greater"){stop("Use alternative = 'greater' for ANOVA or chi-square test.", call. = FALSE)} | |
} | |
# errors about success | |
if ((y_type == "categorical" & x_levels == 2 & y_levels == 2) | (y_type == "categorical" & is.null(x))) { | |
# error: success not provided for categorical variable for 1 or 2 proportion ci or ht | |
if (is.null(success)) { | |
y_level_names = levels(y) | |
stop(paste("Response variable is categorical, specify which level to call success: ", y_level_names[1], " or ", y_level_names[2]), call. = FALSE) | |
} | |
# error: success provided is not a level of the categorical variable | |
if (success %in% levels(y) == FALSE) { | |
stop(paste(success,"is not a level of the success variable."), call. = FALSE) | |
} | |
} | |
# warning: success provided for numerical variable | |
if (y_type == "numerical" & !is.null(success)) { | |
warning("Ignoring success since y are numerical.\n", call. = FALSE) | |
} | |
# warning: confidence level greater than 1 | |
if (conflevel > 1) { | |
conflevel = conflevel / 100 | |
warning(paste("Confidence level converted to ", conflevel, ".", sep = ""), call. = FALSE) | |
} | |
# warning: significance level greater than 1 | |
if (siglevel > 1) { | |
siglevel = siglevel / 100 | |
warning(paste("Significance level converted to ", siglevel, ".", sep = ""), call. = FALSE) | |
} | |
# define sample size | |
n = length(y) | |
# define sign of hypothesis test, for one and two means, medians, and proportions | |
if (type == "ht") { | |
if (alternative == "less") {sign = "<"} | |
if (alternative == "greater") {sign = ">"} | |
if (alternative == "twosided") {sign = "!="} | |
} | |
# one variable | |
if (x_type == "only1var") { | |
# print what's going on | |
if(y_type == "numerical"){cat("Single", est, "\n")} | |
if(y_type == "categorical"){cat("Single", est, "-- success:", success, "\n")} | |
# set statistic: mean, median, or proportion | |
if (y_type == "numerical") {statistic = match.fun(est)} | |
if (y_type == "categorical") {statistic = function(x) {sum(x == success)/length(x)}} | |
actual = statistic(y) | |
if(sum_stats == TRUE){cat("Summary statistics: ")} | |
if(est == "mean"){ | |
if(eda_plot == TRUE){hist(y, main = "", cex.main = 0.75, xlab = y_name, col = COL[3,4], ylab = "")} | |
if(sum_stats == TRUE){cat(paste("mean =", round(actual,4), "; ","sd =", round(sd(y), 4), "; ", "n =", n), "\n")} | |
} | |
if(est == "median"){ | |
if(eda_plot == TRUE){ | |
boxplot(y, main = "", main = "", xlab = y_name, col = COL[3,4], axes = FALSE) | |
axis(2) | |
} | |
if(sum_stats == TRUE){cat(paste("median =", round(actual,4), "; ", "n =", n), "\n")} | |
} | |
if(est == "proportion"){ | |
if(eda_plot == TRUE){barplot(table(y), main = "", xlab = y_name, col = COL[3,4])} | |
if(sum_stats == TRUE){cat(paste("p_hat =", round(actual,4), "; ", "n =", n), "\n")} | |
} | |
# simulation | |
if (method == "simulation") { | |
sim = matrix(NA, nrow = n, ncol = nsim) | |
# bootstrap ci | |
if (type == "ci") { | |
for(i in 1:nsim) {sim[,i] = sample(y, n, replace = TRUE)} | |
if (y_type == "categorical") { | |
statistic = function(x) { | |
which_success = which(levels(y) == success) | |
sum(x == which_success)/length(x) | |
} | |
} | |
sim_dist = apply(sim, 2, statistic) | |
if(boot_method == "perc"){ | |
ci = quantile(sim_dist, c( (1 - conflevel)/2 , ((1 - conflevel)/2)+conflevel )) | |
} | |
if(boot_method == "se"){ | |
critvalue = qnorm((1 - conflevel)/2 + conflevel ) | |
boot_se = sd(sim_dist) | |
me = critvalue * boot_se | |
ci = actual + c(-1,1) * me | |
} | |
if(inf_plot == TRUE){ | |
#if(y_type == "categorical"){xlim = c(min(sim_dist)-sd(sim_dist), max(sim_dist)+sd(sim_dist))} | |
#if(y_type == "numerical"){xlim = c(min(sim_dist)*0.8, max(sim_dist)*1.2)} | |
xlim = c(min(sim_dist)-0.8*sd(sim_dist), max(sim_dist)+0.8*sd(sim_dist)) | |
if (nsim > 500) { | |
counts = hist(sim_dist, plot = FALSE)$counts | |
hist(sim_dist, xlab = "Bootstrap distribution", main = "", ylab = "", xlim = xlim, col = COL[1,2]) | |
if (inf_lines == TRUE & inf_plot == TRUE) { | |
for (i in 1:2) { | |
segments(ci[i], 0, ci[i], 0.8 * max(counts), col=COL[4], lwd=2) | |
text(round(ci[i],4), max(counts), pos=1, col=COL[4], round(ci[i],4)) | |
} | |
} | |
} | |
if (nsim <= 500) { | |
BHH2::dotPlot(sim_dist, xlim = xlim, pch=19, col=COL[1,2], cex = 0.8, axes = FALSE, xlab="Bootstrap distribution") | |
axis(1) | |
if(inf_lines == TRUE & inf_plot == TRUE){ | |
for (i in 1:2) { | |
segments(ci[i], 0, ci[i], 0.6, col=COL[4], lwd=2) | |
text(round(ci[i],4), 0.7, pos=1, col=COL[4], round(ci[i],4)) | |
} | |
} | |
} | |
} | |
cat(c("Bootstrap method:", ifelse(boot_method == "se", "Standard error; ", "Percentile\n"))) | |
#if(boot_method == "se") {cat(c("Boot. mean =", round(boot_mean, 4),"; "))} | |
if(boot_method == "se") {cat(c("Boot. SE =", round(boot_se, 4),"\n"))} | |
cat(c(conflevel*100, "% Bootstrap interval = (", round(ci[1],4), ",", round(ci[2],4), ")\n")) | |
} | |
# randomization test | |
if (type == "ht") { | |
if (y_type == "numerical") { | |
for(i in 1:nsim) {sim[,i] = sample(y, n, replace = TRUE)} | |
sim_dist_temp = apply(sim, 2, statistic) | |
if (est == "mean") { | |
# hypotheses | |
cat(paste("H0: mu =", null, "\n")) | |
cat(paste("HA: mu", sign, null, "\n")) | |
sim_dist = sim_dist_temp - (mean(sim_dist_temp) - null) | |
} | |
if (est == "median") { | |
cat(paste("H0: median =", null, "\n")) | |
cat(paste("HA: median", sign, null, "\n")) | |
sim_dist = sim_dist_temp - (median(sim_dist_temp) - null) | |
} | |
} | |
if (y_type == "categorical") { | |
cat(paste("H0: p =", null, "\n")) | |
cat(paste("HA: p", sign, null, "\n")) | |
sim_dist = rbinom(nsim, n, prob = null) / n | |
} | |
smaller_tail = round(min(c(mean(sim_dist <= actual), mean(sim_dist >= actual))), 4) | |
if(inf_plot == TRUE){ | |
#if(y_type == "categorical"){xlim = c(min(sim_dist)-sd(sim_dist), max(sim_dist)+sd(sim_dist))} | |
#if(y_type == "numerical"){xlim = c(min(sim_dist)*0.8, max(sim_dist)*1.2)} | |
xlim = c(min(sim_dist)-0.8*sd(sim_dist), max(sim_dist)+0.8*sd(sim_dist)) | |
if (nsim > 500) { | |
counts = hist(sim_dist, plot = FALSE)$counts | |
hist(sim_dist, xlab = "Randomization distribution", main = "", ylab = "", xlim = xlim, col = COL[1,2]) | |
} | |
if (nsim <= 500) { | |
BHH2::dotPlot(sim_dist, xlim = xlim, pch=19, col=COL[1,2], cex = 0.8, axes = FALSE, xlab="Randomization distribution") | |
axis(1) | |
} | |
} | |
#alternative = match.arg(alternative) | |
if (alternative == "less") { | |
# counts = hist(sim_dist, plot = FALSE)$counts | |
if (actual < null) {cat(paste("p-value = ", smaller_tail,"\n"))} | |
if (actual > null) {cat(paste("p-value = ", 1 - smaller_tail,"\n"))} | |
if (inf_lines == TRUE & inf_plot == TRUE) { | |
if(nsim > 500) {lines(x = c(actual,actual), y = c(0,1.1*max(counts)), col=COL[4], lwd=2)} | |
if(nsim <= 500) {lines(x = c(actual,actual), y = c(0,0.8), col=COL[4], lwd=2)} | |
} | |
} | |
if (alternative == "greater") { | |
# counts = hist(sim_dist, plot = FALSE)$counts | |
if (actual < null) {cat(paste("p-value = ", 1 - smaller_tail,"\n"))} | |
if (actual > null) {cat(paste("p-value = ", smaller_tail,"\n"))} | |
if (inf_lines == TRUE & inf_plot == TRUE) { | |
if(nsim > 500) {lines(x = c(actual,actual), y = c(0,1.1*max(counts)), col=COL[4], lwd=2)} | |
if(nsim <= 500) {lines(x = c(actual,actual), y = c(0,0.8), col=COL[4], lwd=2)} | |
} | |
} | |
if (alternative == "twosided") { | |
# counts = hist(sim_dist, plot = FALSE)$counts | |
cat(paste("p-value = ", smaller_tail * 2,"\n")) | |
lines(x = c(actual,actual), y = c(0,1.1*max(counts)), col=COL[4], lwd=2) | |
if (actual >= null) { | |
temp = actual - null | |
if (inf_lines == TRUE & inf_plot == TRUE) { | |
if(nsim > 500) {lines(x = c(null - temp,null - temp), y = c(0,1.1*max(counts)), col=COL[4], lwd=2)} | |
if(nsim <= 500) {lines(x = c(null - temp,null - temp), y = c(0,0.8), col=COL[4], lwd=2)} | |
} | |
} | |
if (actual < null) { | |
temp = null - actual | |
if (inf_lines == TRUE & inf_plot == TRUE) { | |
if(nsim > 500) {lines(x = c(null + temp,null + temp), y = c(0,1.1*max(counts)), col=COL[4], lwd=2)} | |
if(nsim <= 500) {lines(x = c(null + temp,null + temp), y = c(0,0.8), col=COL[4], lwd=2)} | |
} | |
} | |
} | |
if (inf_lines == TRUE & inf_plot == TRUE) { | |
if(nsim > 500) {text(x = actual, y = 1.2*max(counts), paste("observed\n", round(actual,4)), col=COL[4], cex = 0.8)} | |
if(nsim <= 500) {text(x = actual, y = 0.9, paste("observed\n", round(actual,4)), col=COL[4], cex = 0.8)} | |
} | |
} | |
} | |
# theoretical | |
if (method == "theoretical") { | |
# confidence interval | |
if (type == "ci") { | |
if (y_type == "numerical") { | |
if (est == "median") {stop("Use simulation methods for inference for the median.", call. = FALSE)} | |
if (est == "mean") { | |
# calculate me and se | |
se = sd(y) / sqrt(n) | |
cat(paste("Standard error =", round(se, 4), "\n")) | |
if (n >= 30) {critvalue = qnorm( (1 - conflevel)/2 + conflevel )} | |
if (n < 30) {critvalue = qt( (1 - conflevel)/2 + conflevel , df = n - 1)} | |
} | |
} | |
if (y_type == "categorical") { | |
# check conditions | |
suc = round(n * actual, 2) | |
fail = round(n * (1 - actual), 2) | |
cat(paste("Check conditions: number of successes =", round(suc), ";", "number of failures =", round(fail)), "\n") | |
if (suc < 10 | fail < 10) { | |
stop("There aren't at least 10 successes and 10 failures, use simulation methods instead.", call. = FALSE) | |
} | |
# calculate me and se | |
se = sqrt(actual * (1-actual) / n) | |
cat(paste("Standard error =", round(se, 4), "\n")) | |
critvalue = qnorm( (1 - conflevel)/2 + conflevel ) | |
} | |
me = critvalue * se | |
ci = c(actual - me , actual + me) | |
cat(c(conflevel*100, "% Confidence interval = (", round(ci[1],4), ",", round(ci[2],4), ")\n")) | |
} | |
# hypothesis test | |
if (type == "ht") { | |
if (y_type == "numerical") { | |
if (est == "median") {stop("Use simulation methods for inference for the median.", call. = FALSE)} | |
if (est == "mean") { | |
# hypotheses | |
cat(paste("H0: mu =", null, "\n")) | |
cat(paste("HA: mu", sign, null, "\n")) | |
# calculate test statistic and p-value component | |
se = sd(y) / sqrt(n) | |
cat("Standard error =", round(se,4), "\n") | |
teststat = (actual - null)/se | |
if (n >= 30) { | |
cat(paste("Test statistic: Z =", round(teststat, 3),"\n")) | |
smaller_tail = round(min(pnorm(teststat), pnorm(teststat, lower.tail = FALSE)), 4) | |
} | |
if (n < 30) { | |
cat(paste("Test statistic: T =", round(teststat, 3),"\n")) | |
cat(paste("Degrees of freedom: ", n - 1, "\n")) | |
smaller_tail = round(min(pt(teststat, df = n - 1), pt(teststat, df = n - 1, lower.tail = FALSE)), 4) | |
} | |
} | |
} | |
if (y_type == "categorical") { | |
if (null < 0 | null > 1) { | |
stop("Null value should be a proportion between 0 and 1.", call. = FALSE) | |
} | |
# hypotheses | |
cat(paste("H0: p =", null, "\n")) | |
cat(paste("HA: p", sign, null, "\n")) | |
# check conditions | |
exp_suc = round(n * null, 2) | |
exp_fail = round(n * (1 - null), 2) | |
cat(paste("Check conditions: number of expected successes =", round(exp_suc), ";", "number of expected failures =", round(exp_fail)), "\n") | |
if (exp_suc < 10 | exp_fail < 10) { | |
stop("There aren't at least 10 expected successes and 10 expected failures, use simulation methods instead.", call. = FALSE) | |
} | |
# calculate test statistic and p-value | |
se = sqrt(null * (1 - null) / n) | |
cat("Standard error =", round(se,4), "\n") | |
teststat = (actual - null)/se | |
cat(paste("Test statistic: Z = ", round(teststat, 3),"\n")) | |
smaller_tail = round(min(pnorm(teststat), pnorm(teststat, lower.tail = FALSE)), 4) | |
} | |
# alternative = less | |
if (alternative == "less") { | |
if (actual < null) {cat(paste("p-value = ", smaller_tail,"\n"))} | |
if (actual > null) {cat(paste("p-value = ", 1 - smaller_tail,"\n"))} | |
normTail(L = teststat, axes = FALSE, col = COL[1,2]) | |
axis(1, at = c(-3, teststat, 0, 3), labels = c(NA, paste(round(actual,2)), paste(null), NA)) | |
} | |
# alternative = greater | |
if (alternative == "greater") { | |
if (actual < null) {cat(paste("p-value = ", 1 - smaller_tail,"\n"))} | |
if (actual > null) {cat(paste("p-value = ", smaller_tail,"\n"))} | |
if(inf_plot == TRUE){ | |
normTail(U = teststat, axes = FALSE, col = COL[1,2]) | |
axis(1, at = c(-3, 0, teststat, 3), labels = c(NA, paste(null), paste(round(actual,2)), NA)) | |
} | |
} | |
# alternative = twosided | |
if (alternative == "twosided") { | |
cat(paste("p-value = ", smaller_tail * 2,"\n")) | |
if (inf_plot == TRUE){ | |
if (actual < null) { | |
normTail(L = teststat, U = -1*teststat, axes = FALSE, col = COL[1,2]) | |
axis(1, at = c(-3, teststat, 0, -1*teststat, 3), labels = c(NA, paste(round(actual,2)), paste(null), paste(round(null + (null - actual), 2)), NA)) | |
} | |
if (actual > null) { | |
normTail(L = -1*teststat, U = teststat, axes = FALSE, col = COL[1,2]) | |
axis(1, at = c(-3, -1*teststat, 0, teststat, 3), labels = c(NA, paste(round(null - (actual - null), 2)), paste(null), paste(round(actual,2)), NA)) | |
} | |
} | |
} | |
} | |
} | |
} | |
# two variables | |
if (x_type == "categorical") { | |
# print what's going on | |
if (y_type == "numerical") { | |
if (x_levels == 2) {cat("Difference between two ", est, "s", "\n", sep = "")} | |
if (x_levels > 2) {cat("ANOVA\n")} | |
} | |
if (y_type == "categorical") { | |
if (x_levels == 2 & y_levels == 2) {cat("Difference between two ", est, "s -- success: ", success, "\n", sep = "")} | |
if (x_levels > 2 | y_levels > 2) {cat("Chi-square test of independence\n")} | |
} | |
# x variable with 2 levels | |
if (x_levels == 2 & (y_type == "numerical" | (y_type == "categorical" & y_levels == 2))) { | |
# order | |
if (is.null(order)) {order = levels(x)} | |
if (length(order) == 1 & !is.na(order[1])) { | |
stop("Order cannot be of length 1, list the order in which two levels of the xing variable should be subtracted.", call. = FALSE) | |
} | |
if (length(order) == 2) { | |
if ( all(order %in% levels(x)) == FALSE) { | |
str = paste(order[which(!(order %in% levels(x)))], collapse=" ") | |
stop(paste(str,"is not a level of the explanatory variable.",sep = " "), call. = FALSE) | |
} | |
if ((sum(levels(x) == order) == 0) == TRUE) { | |
x = relevel(x, ref = levels(as.factor(x))[2]) | |
} | |
if ((sum(levels(x) == order) == 0) == FALSE) { | |
x = x | |
} | |
} | |
# calculate sample sizes | |
n1 = sum(x==levels(as.factor(x))[1]) | |
n2 = sum(x==levels(as.factor(x))[2]) | |
# summary statistics and eda plots | |
if(sum_stats == TRUE){cat("Summary statistics:\n")} | |
if (est == "mean") { | |
if(sum_stats == TRUE){ | |
for(i in 1:x_levels) { | |
cat("n_", names(by(y, x, length))[i], " = ", by(y, x, length)[i], ", ", sep="") | |
cat("mean_", names(by(y, x, mean))[i], " = ", round(by(y, x, mean)[i],4), ", ", sep="") | |
cat("sd_", names(by(y, x, sd))[i], " = ", round(by(y, x, sd)[i],4), "\n", sep="") | |
} | |
} | |
if(eda_plot == TRUE){boxplot(y ~ x, xlab = x_name, ylab = y_name, main = "", col = COL[3,4])} | |
} | |
if (est == "median") { | |
if(sum_stats == TRUE){ | |
for(i in 1:x_levels) { | |
cat("n_", names(by(y, x, length))[i], " = ", by(y, x, length)[i], ", ", sep="") | |
cat("median_", names(by(y, x, median))[i], " = ", round(by(y, x, median)[i],4), ", ", sep="") | |
} | |
} | |
if(eda_plot == TRUE){boxplot(y ~ x, xlab = x_name, ylab = y_name, main = "", col = COL[3,4])} | |
} | |
if (est == "proportion") { | |
y_table = table(y, x) | |
if(sum_stats == TRUE){print(addmargins(y_table))} | |
if(eda_plot == TRUE){mosaicplot(table(x, y), xlab = x_name, ylab = y_name, main = "", col = COL[3,4])} | |
} | |
# set statistic: difference between means, medians, or proportions | |
if (y_type == "numerical") { | |
statistic <- function(y, x) { | |
if (est == "mean") { | |
stat = mean(y[x == levels(as.factor(x))[1]]) - mean(y[x == levels(as.factor(x))[2]]) | |
} | |
if (est == "median") { | |
stat = median(y[x == levels(as.factor(x))[1]]) - median(y[x == levels(as.factor(x))[2]]) | |
} | |
return(stat) | |
} | |
} | |
if (y_type == "categorical") { | |
statistic <- function(y, x) { | |
sum(y == success & x == levels(as.factor(x))[1])/n1 - sum(y == success & x == levels(as.factor(x))[2])/n2 | |
} | |
} | |
# calculate and print actual | |
actual = statistic(y, x) | |
cat("Observed difference between ", est, "s (", levels(x)[1], "-", levels(x)[2] ,") = ", round(actual,4), "\n", sep = "") | |
# save label | |
label = paste("Difference in sample ", est, "s", ", ", levels(as.factor(x))[1],"-",levels(as.factor(x))[2], sep = "") | |
# simulation | |
if (method == "simulation") { | |
n = length(y) | |
sim = matrix(NA, nrow = n, ncol = nsim) | |
# bootstrap ci | |
if (type == "ci") { | |
if (y_type == "numerical") {statistic = match.fun(est)} | |
if (y_type == "categorical") { | |
statistic = function(x) { | |
which_success = which(levels(y) == success) | |
sum(x == which_success)/length(x) | |
} | |
} | |
sim1 = matrix(NA, nrow = n1, ncol = nsim) | |
sim2 = matrix(NA, nrow = n2, ncol = nsim) | |
for(i in 1:nsim) {sim1[,i] = sample(y[x == order[1]], n1, replace = TRUE)} | |
for(i in 1:nsim) {sim2[,i] = sample(y[x == order[2]], n2, replace = TRUE)} | |
sim_dist1 = apply(sim1, 2, statistic) | |
sim_dist2 = apply(sim2, 2, statistic) | |
sim_dist = sim_dist1 - sim_dist2 | |
if(boot_method == "perc"){ | |
ci = quantile(sim_dist, c( (1 - conflevel)/2 , ((1 - conflevel)/2)+conflevel )) | |
} | |
if(boot_method == "se"){ | |
critvalue = qnorm( (1 - conflevel)/2 + conflevel ) | |
#boot_mean = mean(sim_dist) | |
boot_se = sd(sim_dist) | |
me = critvalue * boot_se | |
ci = actual + c(-1,1) * me | |
} | |
counts = table(sim_dist) | |
if(inf_plot == TRUE){ | |
#if(y_type == "categorical"){xlim = c(min(sim_dist)-sd(sim_dist), max(sim_dist)+sd(sim_dist))} | |
#if(y_type == "numerical"){xlim = c(min(sim_dist)*0.8, max(sim_dist)*1.2)} | |
xlim = c(min(sim_dist)-0.8*sd(sim_dist), max(sim_dist)+0.8*sd(sim_dist)) | |
if (nsim > 500) { | |
counts = hist(sim_dist, plot = FALSE)$counts | |
hist(sim_dist, xlab = "Bootstrap distribution", main = "", ylab = "", xlim = xlim, col = COL[1,2]) | |
if (inf_lines == TRUE & inf_plot == TRUE) { | |
for (i in 1:2) { | |
segments(ci[i], 0, ci[i], 0.8 * max(counts), col=COL[4], lwd=2) | |
text(round(ci[i],4), max(counts), pos=1, col=COL[4], round(ci[i],4)) | |
} | |
} | |
} | |
if (nsim <= 500) { | |
BHH2::dotPlot(sim_dist, xlim = xlim, pch=19, col=COL[1,2], cex = 0.8, axes = FALSE, xlab="Bootstrap distribution") | |
axis(1) | |
if(inf_lines == TRUE & inf_plot == TRUE){ | |
for (i in 1:2) { | |
segments(ci[i], 0, ci[i], 0.6, col=COL[4], lwd=2) | |
text(round(ci[i],4), 0.7, pos=1, col=COL[4], round(ci[i],4)) | |
} | |
} | |
} | |
} | |
cat(c("Bootstrap method:", ifelse(boot_method == "se", "Standard error; ", "Percentile\n"))) | |
#if(boot_method == "se") {cat(c("Boot. mean =", round(boot_mean, 4),"; "))} | |
if(boot_method == "se") {cat(c("Boot. SE =", round(boot_se, 4),"\n"))} | |
cat(c(conflevel*100, "% Bootstrap interval = (", round(ci[1],4), ",", round(ci[2],4), ")\n")) | |
} | |
# randomization test | |
if (type == "ht") { | |
# hypotheses | |
if (est == "mean") { | |
mu1 = paste("mu_", order[1], sep = "") | |
mu2 = paste("mu_", order[2], sep = "") | |
cat(paste("H0:", mu1 , "-", mu2, "=", null, "\n")) | |
cat(paste("HA:", mu1 , "-", mu2, sign, null, "\n")) | |
} | |
if (est == "median") { | |
med1 = paste("median_", order[1], sep = "") | |
med2 = paste("median_", order[2], sep = "") | |
cat(paste("H0:", med1 , "-", med2, "=", null, "\n")) | |
cat(paste("HA:", med1 , "-", med2, sign, null, "\n")) | |
} | |
if (est == "proportion") { | |
p1 = paste("p_", order[1], sep = "") | |
p2 = paste("p_", order[2], sep = "") | |
cat(paste("H0:", p1 , "-", p2, "=", null, "\n")) | |
cat(paste("HA:", p1 , "-", p2, sign, null, "\n")) | |
} | |
for(i in 1:nsim) {sim[,i] = sample(x, n, replace = FALSE)} | |
sim_dist = apply(sim, 2, statistic, y = y) | |
smaller_tail = round(min(c(mean(sim_dist <= actual), mean(sim_dist >= actual))), 4) | |
xmin = min(c(-1.1*abs(actual), sim_dist)) | |
xmax = max(c(1.1*actual, sim_dist)) | |
if (inf_plot == TRUE){ | |
if (nsim > 500) { | |
counts = hist(sim_dist, plot = FALSE)$counts | |
hist(sim_dist, main = "Randomization distribution", xlab = "", ylim = c(0, 1.3 * max(counts)), xlim = c(xmin,xmax), cex.main = 0.75, col = COL[1,2]) | |
} | |
if (nsim <= 500) { | |
if (y_type == "numerical") { | |
counts = BHH2::dotPlot(sim_dist, main = "Randomization distribution", xlab = "", pch = 20, cex = 0.8, xlim = c(xmin,xmax), cex.main = 0.75)$y | |
} | |
if (y_type == "categorical") { | |
counts = table(sim_dist) | |
plot(sim_dist, type = "n", ylim = c(0,max(counts)*1.3), axes = FALSE, xlim = c(0.9*min(sim_dist),1.1*max(sim_dist)), main = "Randomization distribution", xlab = "", ylab = "", cex.main = 0.75) | |
axis(1) | |
axis(2) | |
for(i in 1:length(sim_dist)) { | |
x <- sim_dist[i] | |
rec <- sum(sim_dist == x) | |
points(rep(x, rec), 1:rec, pch=20, cex=0.8) | |
} | |
} | |
} | |
} | |
#alternative = match.arg(alternative) | |
if (alternative == "less") { | |
if (actual < null) {cat(paste("p-value = ", smaller_tail,"\n"))} | |
if (actual > null) {cat(paste("p-value = ", 1 - smaller_tail,"\n"))} | |
if (inf_lines == TRUE & inf_plot == TRUE) {lines(x = c(actual,actual), y = c(0,1.1*max(counts)), col="#569BBD", lwd=2)} | |
} | |
if (alternative == "greater") { | |
if (actual < null) {cat(paste("p-value = ", 1 - smaller_tail,"\n"))} | |
if (actual > null) {cat(paste("p-value = ", smaller_tail,"\n"))} | |
if (inf_lines == TRUE & inf_plot == TRUE) {lines(x = c(actual,actual), y = c(0,1.1*max(counts)), col="#569BBD", lwd=2)} | |
} | |
if (alternative == "twosided") { | |
cat(paste("p-value = ", smaller_tail * 2,"\n")) | |
if (inf_lines == TRUE & inf_plot == TRUE) {lines(x = c(actual,actual), y = c(0,1.1*max(counts)), col="#569BBD", lwd=2)} | |
if (actual >= null) { | |
temp = actual - null | |
if (inf_lines == TRUE & inf_plot == TRUE) {lines(x = c(null - temp,null - temp), y = c(0,1.1*max(counts)), col = COL[1,2], lwd=2)} | |
} | |
if (actual < null) { | |
temp = null - actual | |
if (inf_lines == TRUE & inf_plot == TRUE) {lines(x = c(null + temp,null + temp), y = c(0,1.1*max(counts)), col = COL[1,2], lwd=2)} | |
} | |
} | |
if (inf_lines == TRUE & inf_plot == TRUE) {text(x = actual, y = 1.25*max(counts), paste("observed\n", round(actual,4)), col = COL[1,2], cex = 0.8)} | |
} | |
} | |
# theoretical | |
if (method == "theoretical") { | |
# confidence interval | |
if (type == "ci") { | |
if (y_type == "numerical") { | |
if (est == "median") {stop("Use simulation methods for inference for the median.", call. = FALSE) | |
} | |
if (est == "mean") { | |
# calculate se and critvalue | |
s1 = sd(y[x == levels(x)[1]]) | |
s2 = sd(y[x == levels(x)[2]]) | |
se = sqrt(s1^2/n1 + s2^2/n2) | |
cat("Standard error =", round(se,4), "\n") | |
if (n1 >= 30 & n2 >= 30) {critvalue = qnorm( (1 - conflevel)/2 + conflevel )} | |
if (n1 < 30 | n2 < 30) {critvalue = qt( (1 - conflevel)/2 + conflevel , df = min(n1 - 1, n2 - 1))} | |
} | |
} | |
if (y_type == "categorical") { | |
# check conditions | |
cat("Check conditions:\n") | |
suc1 = sum(y[x == levels(x)[1]] == success) | |
fail1 = sum(y[x == levels(x)[1]] != success) | |
cat(paste(" ", levels(x)[1], ": number of successes =", round(suc1), ";", "number of failures =", round(fail1)), "\n") | |
suc2 = sum(y[x == levels(x)[2]] == success) | |
fail2 = sum(y[x == levels(x)[2]] != success) | |
cat(paste(" ", levels(x)[2], ": number of successes =", round(suc2), ";", "number of failures =", round(fail2)), "\n") | |
if (suc1 < 10 | fail1 < 10 | suc2 < 10 | fail2 < 10) { | |
stop("There aren't at least 10 successes and 10 failures, use simulation methods instead.", call. = FALSE) | |
} | |
# calculate se and critvalue | |
p1 = suc1 / n1 | |
p2 = suc2 / n2 | |
se = sqrt(p1 * (1-p1)/n1 + p2 * (1-p2)/n2) | |
cat("Standard error =", round(se,4), "\n") | |
critvalue = qnorm( (1 - conflevel)/2 + conflevel ) | |
} | |
# calculate ci | |
me = critvalue * se | |
ci = c(actual - me , actual + me) | |
cat(c(conflevel*100, "% Confidence interval = (", round(ci[1],4), ",", round(ci[2],4), ")\n")) | |
} | |
# hypothesis test | |
if (type == "ht") { | |
if (y_type == "numerical") { | |
if (est == "median") {stop("Use simulation methods for inference for the median.", call. = FALSE) | |
} | |
if (est == "mean") { | |
# hypotheses | |
mu1 = paste("mu_", order[1], sep = "") | |
mu2 = paste("mu_", order[2], sep = "") | |
cat(paste("H0:", mu1 , "-", mu2, "=", null, "\n")) | |
cat(paste("HA:", mu1 , "-", mu2, sign, null, "\n")) | |
# calculate test statistic and p-value component | |
s1 = sd(y[x == levels(x)[1]]) | |
s2 = sd(y[x == levels(x)[2]]) | |
se = sqrt(s1^2/n1 + s2^2/n2) | |
cat("Standard error =", round(se,3), "\n") | |
teststat = (actual - null)/se | |
if (n1 >= 30 & n2 >= 30) { | |
cat(paste("Test statistic: Z = ", round(teststat, 3),"\n")) | |
smaller_tail = round(min(pnorm(teststat), pnorm(teststat, lower.tail = FALSE)), 4) | |
} | |
if (n1 < 30 | n2 < 30) { | |
cat(paste("Test statistic: T = ", round(teststat, 3),"\n")) | |
cat(paste("Degrees of freedom: ", min(n1 - 1, n2 - 1), "\n")) | |
smaller_tail = round(min(pt(teststat, df = min(n1 - 1, n2 - 1)), pt(teststat, df = min(n1 - 1, n2 - 1), lower.tail = FALSE)), 4) | |
} | |
} | |
} | |
if (y_type == "categorical") { | |
if (null <= -1 | null >= 1) { | |
stop("Null value should be a proportion between 0 and 1.", call. = FALSE) | |
} | |
# hypotheses | |
p1 = paste("p_", order[1], sep = "") | |
p2 = paste("p_", order[2], sep = "") | |
cat(paste("H0:", p1 , "-", p2, "=", null, "\n")) | |
cat(paste("HA:", p1 , "-", p2, sign, null, "\n")) | |
# calculate p_pool | |
suc1 = sum(y[x == levels(x)[1]] == success) | |
fail1 = sum(y[x == levels(x)[1]] != success) | |
suc2 = sum(y[x == levels(x)[2]] == success) | |
fail2 = sum(y[x == levels(x)[2]] != success) | |
p_pool = (suc1 + suc2)/(n1 + n2) | |
cat(paste("Pooled proportion =", round(p_pool, 4), "\n")) | |
# check conditions | |
cat("Check conditions:\n") | |
exp_suc1 = n1 * p_pool | |
exp_fail1 = n1 * (1 - p_pool) | |
cat(paste(" ", levels(x)[1], ": number of expected successes =", round(exp_suc1), ";", "number of expected failures =", round(exp_fail1)), "\n") | |
exp_suc2 = n2 * p_pool | |
exp_fail2 = n2 * (1 - p_pool) | |
cat(paste(" ", levels(x)[2], ": number of expected successes =", round(exp_suc2), ";", "number of expected failures =", round(exp_fail2)), "\n") | |
if (exp_suc1 < 10 | exp_fail1 < 10 | exp_suc2 < 10 | exp_fail2 < 10) { | |
stop("There aren't at least 10 expected successes and 10 expected failures, use simulation methods instead.", call. = FALSE) | |
} | |
# calculate test statistic and p-value | |
se = sqrt( p_pool * (1 - p_pool) / n1 + p_pool * (1 - p_pool) / n2 ) | |
cat("Standard error =", round(se,3), "\n") | |
teststat = (actual - null) / se | |
cat(paste("Test statistic: Z = ", round(teststat, 3),"\n")) | |
smaller_tail = round(min(pnorm(teststat), pnorm(teststat, lower.tail = FALSE)), 4) | |
} | |
# alternative = less | |
if (alternative == "less") { | |
if (actual < null) {cat(paste("p-value = ", smaller_tail,"\n"))} | |
if (actual > null) {cat(paste("p-value = ", 1 - smaller_tail,"\n"))} | |
if (inf_plot == TRUE){ | |
normTail(L = teststat, axes = FALSE, col = COL[1,2]) | |
axis(1, at = c(-3, teststat, 0, 3), labels = c(NA, paste(round(actual,2)), paste(null), NA)) | |
} | |
} | |
# alternative = greater | |
if (alternative == "greater") { | |
if (actual < null) {cat(paste("p-value = ", 1 - smaller_tail,"\n"))} | |
if (actual > null) {cat(paste("p-value = ", smaller_tail,"\n"))} | |
if (inf_plot == TRUE){ | |
normTail(U = teststat, axes = FALSE, col = COL[1,2]) | |
axis(1, at = c(-3, 0, teststat, 3), labels = c(NA, paste(null), paste(round(actual,2)), NA)) | |
} | |
} | |
# alternative = twosided | |
if (alternative == "twosided") { | |
cat(paste("p-value = ", smaller_tail * 2,"\n")) | |
if (inf_plot == TRUE){ | |
if (actual < null) { | |
normTail(L = teststat, U = -1*teststat, axes = FALSE, col = COL[1,2]) | |
axis(1, at = c(-3, teststat, 0, -1*teststat, 3), labels = c(NA, paste(round(actual,2)), paste(null), paste(round(null + (null - actual), 2)), NA)) | |
} | |
if (actual > null) { | |
normTail(L = -1*teststat, U = teststat, axes = FALSE, col = COL[1,2]) | |
axis(1, at = c(-3, -1*teststat, 0, teststat, 3), labels = c(NA, paste(round(null - (actual - null), 2)), paste(null), paste(round(actual,2)), NA)) | |
} | |
} | |
} | |
} | |
} | |
} | |
# x variable with >2 levels, ANOVA | |
if (x_levels > 2 & y_type == "numerical") { | |
# summary statistics | |
if(sum_stats == TRUE){cat("Summary statistics:\n")} | |
if(sum_stats == TRUE){ | |
for(i in 1:x_levels) { | |
cat("n_", names(by(y, x, length))[i], " = ", by(y, x, length)[i], ", ", sep="") | |
cat("mean_", names(by(y, x, mean))[i], " = ", round(by(y, x, mean)[i],4), ", ", sep="") | |
cat("sd_", names(by(y, x, sd))[i], " = ", round(by(y, x, sd)[i],4), "\n", sep="") | |
} | |
} | |
if(eda_plot == TRUE){boxplot(y ~ x, xlab = x_name, ylab = y_name, main = "", col = COL[3,4])} | |
# hypotheses | |
cat("H_0: All means are equal.\n") | |
cat("H_A: At least one mean is different.\n") | |
if(method == "theoretical"){ | |
# anova output | |
anova_output = anova(lm(y~x)) | |
print(anova_output, signif.stars=F) | |
# post-hoc pairwise tests, if ANOVA is significant | |
if (anova_output$"Pr(>F)"[1] < siglevel) { | |
cat("\nPairwise tests: ") | |
pairwise = pairwise.t.test(y, x, p.adj = "none") | |
cat(pairwise$method, "\n") | |
print(round(pairwise$p.value,4)) | |
} | |
# plot | |
F = anova_output[["F value"]][1] | |
df_n = anova_output$Df[1] | |
df_d = anova_output$Df[2] | |
if(inf_plot == TRUE) {FTail(F,df_n,df_d, col = COL[1])} | |
} | |
if(method == "simulation"){ | |
# anova output | |
#anova_output = anova(lmp(y~x)) | |
#print(anova_output, signif.stars=F) | |
stop("Simulation based ANOVA is not implemented in this function. If conditions are met, try ANOVA based on theoretical methods.", call. = FALSE) | |
} | |
} | |
# x variable with >2 levels, chi-square | |
if ((x_levels > 2 | y_levels > 2) & y_type == "categorical") { | |
# summary statistics | |
y_table = table(y, x) | |
if(sum_stats == TRUE){ | |
cat("\nSummary statistics:\n") | |
print(addmargins(y_table)) | |
cat("\n") | |
} | |
if(eda_plot == TRUE){mosaicplot(table(x, y), main = "", col = COL[3,4])} | |
# hypotheses | |
cat("H_0: Response and explanatory variable are independent.\n") | |
cat("H_A: Response and explanatory variable are dependent.\n") | |
if (method == "theoretical") { | |
chisquare_output = chisq.test(y_table, correct = FALSE) | |
# check conditions | |
cat("Check conditions: expected counts\n") | |
expected = round(chisquare_output$expected, 2) | |
print(expected) | |
if (any(expected < 5)) { | |
stop("Expected count for at least one cell is less than 5, use method = 'simulation'.", call. = FALSE) | |
} | |
# chi-square output | |
print(chisquare_output) | |
# plot | |
chi = chisquare_output$statistic | |
df = chisquare_output$parameter | |
if (inf_plot == TRUE){chiTail(chi,df)} | |
} | |
if(method == "simulation"){ | |
# chi-square output | |
chisquare_output = chisq.test(y_table, correct = FALSE, simulate.p.value = TRUE, B = nsim) | |
print(chisquare_output) | |
} | |
} | |
} | |
# return simdist | |
if (simdist == TRUE) {return(sim_dist)} | |
# reset plotting window | |
par(mfrow = c(1,1), mar = c(5,4,4,2)+0.1) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment