Skip to content

Instantly share code, notes, and snippets.

@JoachimGoedhart
Last active October 14, 2018 18:59
Show Gist options
  • Save JoachimGoedhart/68c8507452cb607e14e45aa27c17bee7 to your computer and use it in GitHub Desktop.
Save JoachimGoedhart/68c8507452cb607e14e45aa27c17bee7 to your computer and use it in GitHub Desktop.
R-script that calculates a p-value using a randomization test to generate the H0 distribution
#Function that returns p-values from randomization/permutation test
require(dplyr)
require(tidyr)
#Set number of randomizations
nsteps <- 1000
#Read the dataframe - the example data used below is available at: https://github.com/JoachimGoedhart/PlotsOfData
df <- read.csv("Data_wide_example.csv")
#Make it tidy
df_tidy <- df %>% gather(Condition, Value)
#Median and mean values as reference/observed values
df_obs_stats <- df_tidy %>% group_by(Condition) %>% summarise(mean=mean(Value, na.rm=TRUE), median=median(Value, na.rm=TRUE))
#Get the 'Control' condition from the shiny interface (disabled when running as independent script)
# control_condition <- as.character(input$zero)
#Get a list with control values, assumes that there is a condition named "Control"
Controls <- df_tidy %>% filter(Condition=="Control", !is.na(Value)) %>% select("Value") %>% unlist(use.names = FALSE)
#generate a df with differences from the observations
df_obs_stats$mean <- df_obs_stats$mean - mean(Controls)
df_obs_stats$median <- df_obs_stats$median - median(Controls)
#Determine number of observations in Control sample
number_controls <- length(Controls)
#Make a new dataframe with control values for each of the conditions
df_controls <- data.frame(Condition=rep(levels(factor(df_tidy$Condition)), each=number_controls), Value=Controls)
#Add the original data, generating (per condition) Control&Sample values in the column "Value".
df_combi <- bind_rows(df_controls, df_tidy) %>% filter(!is.na(Value))
df_new_stats <- data.frame()
#Perform the randomization nsteps number of times (typically 1,000x)
for (i in 1:nsteps) {
#Randomize the dataframe
df_permutated <- df_combi %>% group_by(Condition) %>% sample_frac()
#Determine the (new) control mean and (new) sample mean
df_control <- df_permutated %>% slice(1: number_controls) %>% summarise(new_control_mean=mean(Value), new_control_median=median(Value))
df_sample <- df_permutated %>% slice((number_controls+1):length(Value))%>% summarise(new_sample_mean=mean(Value), new_sample_median=median(Value))
df_diff <- full_join(df_control, df_sample,by="Condition")
df_new_stats <- bind_rows(df_new_stats, df_diff)
}
#Calculate the difference in mean and median (sample-control) for all the calculated new stats
df_all_diffs <- df_new_stats %>% mutate(new_diff_mean=new_sample_mean-new_control_mean,
new_diff_median=new_sample_median-new_control_median)
#Add the observed stats to stats from permutated df
df_all_diffs <- full_join(df_all_diffs, df_obs_stats, by="Condition")
#Determine the occurences where the permutated difference is more extreme than the observed difference (use absolute values for both for two-tailed test)
df_p_mean <- df_all_diffs %>% group_by(Condition) %>% mutate(count = if_else(abs(new_diff_mean) >= abs(mean), 1, 0)) %>% summarise(p_mean=mean(count))
df_p_median <- df_all_diffs %>% group_by(Condition) %>% mutate(count1 = if_else(abs(new_diff_median) >= abs(median), 1, 0)) %>% summarise(p_median=mean(count1))
def_p <- as.data.frame(full_join(df_p_mean, df_p_median,by="Condition"))
#Replace p-values of zero with <0.001 (0 is theoretically not possible, but an upper bound can estimated, which is 1/nsteps = 1/1000)
def_p[def_p==0]<-"<0.001"
def_p
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment