KamilSJaron/papanova.R

## papanova.R
# written by Kamil S. Jaron based on a 'lab script'
# proudly released to public domain

# args :
# model_formula - formula just like specified in linear models
# input_data - a data frame containing all the data in the formula
# nrep - number of replicates (permutations) that will be performed
# plot_name - if speficied, scripre will generate a pdf with histograms of F statistics calculated with permutated response variable and the original anova F statistics (one per variable)
# number of cores used for computation - default is number of processors - 1; SPECIFY THIS PARAMETER ON BIG SERVERS;

## EXAMPLE
#
# # load the papanova function
# source('papanova.R')
#
# # download some dataset, a nice description can be found here: https://scikit-learn.org/stable/datasets/index.html#wine-recognition-dataset
# library(data.table)
# wines <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data')
# wines <- as.data.frame(wines)
#
# colnames(wines)[c(2,3,5,6)] <- c('alcohol', 'malic_accids', 'magnesium', 'phenols')
#
# # testing how amount of alcohol (V2) relates to
# # malic accid (V3) and iteraction of magnesium and phenols (just to have at least one interation)
# papanova(alcohol ~ malic_accids + magnesium * phenols, wines, 1000, 'wine_plot.pdf')
#
# # generates plot and 4 p-values corresponding to significance of variables
# # "malic_accids" "magnesium" "phenols" "magnesium:phenols"
# # or more generally
# my_formula <- alcohol ~ malic_accids + magnesium * phenols
# attr(terms(my_formula), 'term.labels')

require(parallel)

papanova <- function(model_formula, input_data, nprep = 10, plot_name = NA, no_cores = NA){
    if(is.na(no_cores)){
        no_cores <- detectCores() - 1
    }
    cl <- makeCluster(no_cores, type="FORK")

    # compute reference anova ( x == 1)
    # amd compute nprep - 1 replicates ( x != 1) of anova with permutation of response variable
    # premutations is a matrix (rows are variables, columns are replicates)
    observations <- nrow(input_data)
    premutations <- parSapply(cl, 1:nprep,
                        function(x){
                            if(x == 1){
                                c(anova(lm(model_formula, input_data))$F)
                            } else {
                                response <- all.vars(model_formula)[1]
                                alt_input_data <- input_data
                                alt_input_data[,response] <- sample(input_data[,response])
                                c(anova(lm(model_formula, alt_input_data))$F)
                            }
                        }
                    )
    stopCluster(cl)

    # calculate p-values of individual variables
    num_of_variables <- nrow(premutations) - 1
    pval <- sapply(1:num_of_variables, function(j){ sum(premutations[j,] >= premutations[j,1]) / nprep } )

    # in filename is specified, plot a picture
    if(!is.na(plot_name)){
        dependent_variables <- attr(terms(model_formula), 'term.labels')
        pdf(plot_name)
            for ( var in 1:num_of_variables){
                hist(premutations[var,],
                     xlab="F statistics", main = dependent_variables[var])
                abline(v = premutations[var,1], lwd=2)
            }
        dev.off()
    }
    return(pval)
}
	# written by Kamil S. Jaron based on a 'lab script'
	# proudly released to public domain

	# args :
	# model_formula - formula just like specified in linear models
	# input_data - a data frame containing all the data in the formula
	# nrep - number of replicates (permutations) that will be performed
	# plot_name - if speficied, scripre will generate a pdf with histograms of F statistics calculated with permutated response variable and the original anova F statistics (one per variable)
	# number of cores used for computation - default is number of processors - 1; SPECIFY THIS PARAMETER ON BIG SERVERS;

	## EXAMPLE
	#
	# # load the papanova function
	# source('papanova.R')
	#
	# # download some dataset, a nice description can be found here: https://scikit-learn.org/stable/datasets/index.html#wine-recognition-dataset
	# library(data.table)
	# wines <- fread('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data')
	# wines <- as.data.frame(wines)
	#
	# colnames(wines)[c(2,3,5,6)] <- c('alcohol', 'malic_accids', 'magnesium', 'phenols')
	#
	# # testing how amount of alcohol (V2) relates to
	# # malic accid (V3) and iteraction of magnesium and phenols (just to have at least one interation)
	# papanova(alcohol ~ malic_accids + magnesium * phenols, wines, 1000, 'wine_plot.pdf')
	#
	# # generates plot and 4 p-values corresponding to significance of variables
	# # "malic_accids" "magnesium" "phenols" "magnesium:phenols"
	# # or more generally
	# my_formula <- alcohol ~ malic_accids + magnesium * phenols
	# attr(terms(my_formula), 'term.labels')

	require(parallel)

	papanova <- function(model_formula, input_data, nprep = 10, plot_name = NA, no_cores = NA){
	if(is.na(no_cores)){
	no_cores <- detectCores() - 1
	}
	cl <- makeCluster(no_cores, type="FORK")

	# compute reference anova ( x == 1)
	# amd compute nprep - 1 replicates ( x != 1) of anova with permutation of response variable
	# premutations is a matrix (rows are variables, columns are replicates)
	observations <- nrow(input_data)
	premutations <- parSapply(cl, 1:nprep,
	function(x){
	if(x == 1){
	c(anova(lm(model_formula, input_data))$F)
	} else {
	response <- all.vars(model_formula)[1]
	alt_input_data <- input_data
	alt_input_data[,response] <- sample(input_data[,response])
	c(anova(lm(model_formula, alt_input_data))$F)
	}
	}
	)
	stopCluster(cl)

	# calculate p-values of individual variables
	num_of_variables <- nrow(premutations) - 1
	pval <- sapply(1:num_of_variables, function(j){ sum(premutations[j,] >= premutations[j,1]) / nprep } )

	# in filename is specified, plot a picture
	if(!is.na(plot_name)){
	dependent_variables <- attr(terms(model_formula), 'term.labels')
	pdf(plot_name)
	for ( var in 1:num_of_variables){
	hist(premutations[var,],
	xlab="F statistics", main = dependent_variables[var])
	abline(v = premutations[var,1], lwd=2)
	}
	dev.off()
	}
	return(pval)
	}