Last active
July 3, 2019 14:42
-
-
Save KamilSJaron/358c997698b67486be47d4e8eef2921d to your computer and use it in GitHub Desktop.
PArallel Permutation ANOVA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# written by Kamil S. Jaron based on a 'lab script' | |
# proudly released to public domain | |
# args : | |
# model_formula - formula just like specified in linear models | |
# input_data - a data frame containing all the data in the formula | |
# nrep - number of replicates (permutations) that will be performed | |
# plot_name - if speficied, scripre will generate a pdf with histograms of F statistics calculated with permutated response variable and the original anova F statistics (one per variable) | |
# number of cores used for computation - default is number of processors - 1; SPECIFY THIS PARAMETER ON BIG SERVERS; | |
## EXAMPLE | |
# | |
# # load the papanova function | |
# source('papanova.R') | |
# | |
# # download some dataset, a nice description can be found here: https://scikit-learn.org/stable/datasets/index.html#wine-recognition-dataset | |
# library(data.table) | |
# wines <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data') | |
# wines <- as.data.frame(wines) | |
# | |
# colnames(wines)[c(2,3,5,6)] <- c('alcohol', 'malic_accids', 'magnesium', 'phenols') | |
# | |
# # testing how amount of alcohol (V2) relates to | |
# # malic accid (V3) and iteraction of magnesium and phenols (just to have at least one interation) | |
# papanova(alcohol ~ malic_accids + magnesium * phenols, wines, 1000, 'wine_plot.pdf') | |
# | |
# # generates plot and 4 p-values corresponding to significance of variables | |
# # "malic_accids" "magnesium" "phenols" "magnesium:phenols" | |
# # or more generally | |
# my_formula <- alcohol ~ malic_accids + magnesium * phenols | |
# attr(terms(my_formula), 'term.labels') | |
require(parallel) | |
papanova <- function(model_formula, input_data, nprep = 10, plot_name = NA, no_cores = NA){ | |
if(is.na(no_cores)){ | |
no_cores <- detectCores() - 1 | |
} | |
cl <- makeCluster(no_cores, type="FORK") | |
# compute reference anova ( x == 1) | |
# amd compute nprep - 1 replicates ( x != 1) of anova with permutation of response variable | |
# premutations is a matrix (rows are variables, columns are replicates) | |
observations <- nrow(input_data) | |
premutations <- parSapply(cl, 1:nprep, | |
function(x){ | |
if(x == 1){ | |
c(anova(lm(model_formula, input_data))$F) | |
} else { | |
response <- all.vars(model_formula)[1] | |
alt_input_data <- input_data | |
alt_input_data[,response] <- sample(input_data[,response]) | |
c(anova(lm(model_formula, alt_input_data))$F) | |
} | |
} | |
) | |
stopCluster(cl) | |
# calculate p-values of individual variables | |
num_of_variables <- nrow(premutations) - 1 | |
pval <- sapply(1:num_of_variables, function(j){ sum(premutations[j,] >= premutations[j,1]) / nprep } ) | |
# in filename is specified, plot a picture | |
if(!is.na(plot_name)){ | |
dependent_variables <- attr(terms(model_formula), 'term.labels') | |
pdf(plot_name) | |
for ( var in 1:num_of_variables){ | |
hist(premutations[var,], | |
xlab="F statistics", main = dependent_variables[var]) | |
abline(v = premutations[var,1], lwd=2) | |
} | |
dev.off() | |
} | |
return(pval) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment