Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@KamilSJaron
Last active July 3, 2019 14:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KamilSJaron/358c997698b67486be47d4e8eef2921d to your computer and use it in GitHub Desktop.
Save KamilSJaron/358c997698b67486be47d4e8eef2921d to your computer and use it in GitHub Desktop.
PArallel Permutation ANOVA
# written by Kamil S. Jaron based on a 'lab script'
# proudly released to public domain
# args :
# model_formula - formula just like specified in linear models
# input_data - a data frame containing all the data in the formula
# nrep - number of replicates (permutations) that will be performed
# plot_name - if speficied, scripre will generate a pdf with histograms of F statistics calculated with permutated response variable and the original anova F statistics (one per variable)
# number of cores used for computation - default is number of processors - 1; SPECIFY THIS PARAMETER ON BIG SERVERS;
## EXAMPLE
#
# # load the papanova function
# source('papanova.R')
#
# # download some dataset, a nice description can be found here: https://scikit-learn.org/stable/datasets/index.html#wine-recognition-dataset
# library(data.table)
# wines <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data')
# wines <- as.data.frame(wines)
#
# colnames(wines)[c(2,3,5,6)] <- c('alcohol', 'malic_accids', 'magnesium', 'phenols')
#
# # testing how amount of alcohol (V2) relates to
# # malic accid (V3) and iteraction of magnesium and phenols (just to have at least one interation)
# papanova(alcohol ~ malic_accids + magnesium * phenols, wines, 1000, 'wine_plot.pdf')
#
# # generates plot and 4 p-values corresponding to significance of variables
# # "malic_accids" "magnesium" "phenols" "magnesium:phenols"
# # or more generally
# my_formula <- alcohol ~ malic_accids + magnesium * phenols
# attr(terms(my_formula), 'term.labels')
require(parallel)
papanova <- function(model_formula, input_data, nprep = 10, plot_name = NA, no_cores = NA){
if(is.na(no_cores)){
no_cores <- detectCores() - 1
}
cl <- makeCluster(no_cores, type="FORK")
# compute reference anova ( x == 1)
# amd compute nprep - 1 replicates ( x != 1) of anova with permutation of response variable
# premutations is a matrix (rows are variables, columns are replicates)
observations <- nrow(input_data)
premutations <- parSapply(cl, 1:nprep,
function(x){
if(x == 1){
c(anova(lm(model_formula, input_data))$F)
} else {
response <- all.vars(model_formula)[1]
alt_input_data <- input_data
alt_input_data[,response] <- sample(input_data[,response])
c(anova(lm(model_formula, alt_input_data))$F)
}
}
)
stopCluster(cl)
# calculate p-values of individual variables
num_of_variables <- nrow(premutations) - 1
pval <- sapply(1:num_of_variables, function(j){ sum(premutations[j,] >= premutations[j,1]) / nprep } )
# in filename is specified, plot a picture
if(!is.na(plot_name)){
dependent_variables <- attr(terms(model_formula), 'term.labels')
pdf(plot_name)
for ( var in 1:num_of_variables){
hist(premutations[var,],
xlab="F statistics", main = dependent_variables[var])
abline(v = premutations[var,1], lwd=2)
}
dev.off()
}
return(pval)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment