Skip to content

Instantly share code, notes, and snippets.

Created January 29, 2018 12:44
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save anonymous/fabecccf33f9c3feb568384f626a2c07 to your computer and use it in GitHub Desktop.
Save anonymous/fabecccf33f9c3feb568384f626a2c07 to your computer and use it in GitHub Desktop.
Code for correlations article
# Detecting correlation
# Defines three functions using base R to illustrate techniques for identifying correlations
# between continuous random variables, then tests against different types of data
# Pearsons r, distance correlation, Maximal Information Coefficient (approximated)
# A simple bootstrap function to estimate confidence intervals
bootstrap <- function(x,y,func,reps,alpha){
estimates <- c()
original <- data.frame(x,y)
N <- dim(original)[1]
for(i in 1:reps){
S <- original[sample(1:N, N, replace = TRUE),]
estimates <- append(estimates, func(S$x, S$y))
}
l <- alpha/2 ; u <- 1 - l
interval <- quantile(estimates, c(u, l))
return(2*(func(x,y)) - as.numeric(interval[1:2]))
}
# Pearson's r
Pearsons <- function(x,y){
x <- x - mean(x); y <- y - mean(y)
dotProduct <- 0
for(i in 1:length(x)){
dotProduct <- dotProduct + (x[i] * y[i])
}
magnitudeX <- sqrt(sum(x**2))
magnitudeY <- sqrt(sum(y**2))
cosTheta <- dotProduct / (magnitudeX * magnitudeY)
return(round(cosTheta,4))
}
# Distance Correlation
distanceCorrelation<- function(x,y){
# Function to double center the data
doubleCenter <- function(x){
centered <- x
for(i in 1:dim(x)[1]){
for(j in 1:dim(x)[2]){
centered[i,j] = x[i,j] - mean(x[i,]) - mean(x[,j]) + mean(x)
}
}
return(centered)
}
# Calculate the distance covariance
distanceCovariance <- function(x,y){
N <- length(x)
distX <- as.matrix(dist(x))
distY <- as.matrix(dist(y))
centeredX <- doubleCenter(distX)
centeredY <- doubleCenter(distY)
calc = sum(centeredX * centeredY)
return(sqrt(calc/(N^2)))
}
# Use distanceCovariance(x,x) to get distance Variance of x
distanceVariance <- function(x){return(distanceCovariance(x,x))}
# Find numerator and denominator, return distance correlation
cov = distanceCovariance(x,y)
sd = sqrt(distanceVariance(x)*distanceVariance(y))
return(round(cov/sd ,4))
}
# Maximal Information Coefficient
MIC <- function(x,y){
# Get joint distribution of x and y
jointDist <- function(x,y){
N <- length(x)
u <- unique(append(x,y))
joint <- c()
for(i in u){
for(j in u){
f <- x[paste(x,y) == paste(i,j)]
joint <- append(joint, length(f)/N)
}
}
return(joint)
}
# Calculate the product of the marginal distributions of x and y
marginalProduct <- function(x,y){
N <- length(x)
u <- unique(append(x,y))
marginal <- c()
for(i in u){
for(j in u){
fX <- length(x[x == i]) / N
fY <- length(y[y == j]) / N
marginal <- append(marginal, fX * fY)
}
}
return(marginal)
}
# Mutual Information of x and y
mutualInfo <- function(x,y){
joint <- jointDist(x,y)
marginal <- marginalProduct(x,y)
Hjm <- - sum(joint[marginal > 0] * log(marginal[marginal > 0],2))
Hj <- - sum(joint[joint > 0] * log(joint[joint > 0],2))
return(Hjm - Hj)
}
# Start binning x and y to find MIC
N <- length(x)
maxBins <- ceiling(N ** 0.6)
MI <- c()
for(i in 2:maxBins) {
for (j in 2:maxBins){
if(i * j > maxBins){
next
}
Xbins <- i; Ybins <- j
binnedX <-cut(x, breaks=Xbins, labels = 1:Xbins)
binnedY <-cut(y, breaks=Ybins, labels = 1:Ybins)
MI_estimate <- mutualInfo(binnedX,binnedY)
MI_normalized <- MI_estimate / log(min(Xbins,Ybins),2)
MI <- append(MI, MI_normalized)
}
}
return(round(max(MI),4))
}
# Test data
set.seed(123)
# Noise
x0 <- rnorm(100,0,1)
y0 <- rnorm(100,0,1)
plot(y0~x0, pch =18)
cor(x0,y0)
distanceCorrelation(x0,y0)
MIC(x0,y0)
# Simple linear relationship
x1 <- -20:20
y1 <- x1 + rnorm(41,0,4)
plot(y1~x1, pch =18)
Pearsons(x1,y1)
distanceCorrelation(x1,y1)
MIC(x1,y1)
# y ~ x**2
x2 <- -20:20
y2 <- x2**2 + rnorm(41,0,40)
plot(y2~x2, pch = 18)
Pearsons(x2,y2)
distanceCorrelation(x2,y2)
MIC(x2,y2)
# Cosine
x3 <- -20:20
y3 <- cos(x3/4) + rnorm(41,0,0.2)
plot(y3~x3, type='p', pch=18)
abline(nls(y3~cos(x3/4)),col='red',lwd=2)
cor(x3,y3)
Pearsons(x3,y3)
distanceCorrelation(x3,y3)
MIC(x3,y3)
# Circle
n <- 50
theta <- runif (n, 0, 2*pi)
x4 <- append(cos (theta) , cos(theta))
y4 <- append( sin (theta), -sin (theta))
plot(x4,y4, pch=18)
Pearsons(x4,y4)
distanceCorrelation(x4,y4)
MIC(x4,y4)
cor(x4,y4)
@lu2608
Copy link

lu2608 commented May 10, 2022

Hi, I have used the given code to compute the distance correlation of my two generated variables in basic R. Although I have adjusted the chunk to my specific variables R will not give me a result but only defines the function - What might be the problem here?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment