Skip to content

Instantly share code, notes, and snippets.

@Sarkom
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Sarkom/95e3c7ecfcdbddeec959 to your computer and use it in GitHub Desktop.
Save Sarkom/95e3c7ecfcdbddeec959 to your computer and use it in GitHub Desktop.
Permutation Test R code - performs permutation testing while trying to ensure that we sample from the space of all permutations without replacement.
# permutation_test.R
#
# A function for performing permutation testing that ensures that we sample
# from the space of all permutations without replacement.
#
# Author: Saranga Komanduri (skomanduri@gmail.com)
#
###############################################################################
library(hash)
library(gmp)
library(digest)
library(sn) # For producing skewed distributions
permutation.test <- function(sample1, sample2, metric, I = 1000) {
# Perform a permutation test by sampling without replacement from the space of permutations
start = Sys.time()
# Get test statistic
Tobs = metric(sample1) - metric(sample2)
# Pool samples
n1 = length(sample1)
n2 = length(sample2)
n = n1 + n2
allsamples = c(sample1, sample2)
# Do a bunch of iterations
Tcalc = rep(0,I)
combins = hash()
for( i in 1:I ) {
# Generate random combinations
choicevector = c(rep(0,n1),rep(1,n2))
samplechoice = sample(choicevector,n)
# Convert the samplechoice vector of 1s and 0s to a large binary number
key = as.bigz(paste("0b",paste(samplechoice,collapse=""),sep=""))
# Check if it is in the hash table - key length is max of 256 for the hash table
if( n < 1200 ) # 256 = log_32(2^1280)
{
key = as.character(key, b = 32)
while( !is.null( combins[[ key ]] ) ) {
cat("Collision! Resampling...\n")
samplechoice = sample(choicevector,n)
key = paste(samplechoice,collapse="")
key = as.character(key, b = 32)
}
combins[[ key ]] = 1
}
else
{
key = digest(as.character(key,b=32), algo = "sha1")
while( !is.null( combins[[ key ]] ) ) {
cat("Collision! Resampling...\n")
samplechoice = sample(choicevector,n)
key = paste(samplechoice,collapse="")
key = digest(as.character(key,b=32), algo = "sha1")
}
combins[[ key ]] = 1
}
newsample1 = allsamples[samplechoice == 0]
newsample2 = allsamples[samplechoice == 1]
if ((length(newsample1) != length(sample1)) |
(length(newsample2) != length(sample2))) {
stop("Error in resampling!")
}
Tcalc[i] = metric(newsample1) - metric(newsample2)
}
# Determine p-value
p = sum(abs(Tcalc) >= abs(Tobs)) / I
end = Sys.time()
# print(difftime(end, start))
return(list(pvalue = p,
p.value = p,
Tobs = Tobs,
numiter = I,
time = difftime(end, start),
Tcalc = Tcalc))
}
# Test 1: N(0,1) vs N(100,1)
x = rnorm(1000, 0, 1)
y = rnorm(1000, 100, 1)
cat("N(0,1) vs N(100,1) -- compare normal distributions with same variance and different means:\n")
cat("With var:")
replicate(5, permutation.test(x, y, var)$pvalue)
cat("With sd:")
replicate(5, permutation.test(x, y, sd)$pvalue)
# Test 2: N(0,1) vs N(100,1.5) (SHOULD BE low p-value)
x = rnorm(1000, 0, 1)
y = rnorm(1000, 100, 1.5)
cat("N(0,1) vs N(100,1.5) -- compare normal distributions with different variance and different means:\n")
cat("Resulting p-value should be low!\n")
cat("With var:")
replicate(5, permutation.test(x, y, var)$pvalue)
cat("With sd:")
replicate(5, permutation.test(x, y, sd)$pvalue)
cat("\n\n")
# Skew-normal (SN)
# Test 3: SN(0, 1, 10) vs SN(100, 1, 10)
x = rsn(1000, 0, 1, 10)
y = rsn(1000, 100, 1, 10)
cat("SN(0, 1, 10) vs SN(100, 1, 10) -- compare skew-normal distributions with same variance and different means:\n")
cat("With var:")
replicate(5, permutation.test(x, y, var)$pvalue)
cat("With sd:")
replicate(5, permutation.test(x, y, sd)$pvalue)
# Test 4: SN(0, 1, 10) vs SN(100, 1, -10)
y = rsn(1000, 100, 1, -10)
cat("SN(0, 1, 10) vs SN(100, 1, -10) -- compare skew-normal distributions with same variance, different means, and opposite slant parameters:\n")
cat("With var:")
replicate(5, permutation.test(x, y, var)$pvalue)
cat("With sd:")
replicate(5, permutation.test(x, y, sd)$pvalue)
# Test 5: SN(0, 1, 10) vs SN(100, 1.5, -10) (SHOULD BE low p-value)
x = rsn(1000, 0, 1, 10)
y = rsn(1000, 100, 1.5, -10)
cat("SN(0, 1, 10) vs SN(100, 1.5, -10) -- compare skew-normal distributions with different variance, different means, and opposite slant parameters:\n")
cat("Resulting p-value should be low!\n")
cat("With var:")
replicate(5, permutation.test(x, y, var)$pvalue)
cat("With sd:")
replicate(5, permutation.test(x, y, sd)$pvalue)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment