Instantly share code, notes, and snippets.

# /correlations.R

Created January 29, 2018 12:44
Show Gist options
• Save anonymous/fabecccf33f9c3feb568384f626a2c07 to your computer and use it in GitHub Desktop.
Code for correlations article
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 # Detecting correlation # Defines three functions using base R to illustrate techniques for identifying correlations # between continuous random variables, then tests against different types of data # Pearsons r, distance correlation, Maximal Information Coefficient (approximated) # A simple bootstrap function to estimate confidence intervals bootstrap <- function(x,y,func,reps,alpha){ estimates <- c() original <- data.frame(x,y) N <- dim(original)[1] for(i in 1:reps){ S <- original[sample(1:N, N, replace = TRUE),] estimates <- append(estimates, func(S\$x, S\$y)) } l <- alpha/2 ; u <- 1 - l interval <- quantile(estimates, c(u, l)) return(2*(func(x,y)) - as.numeric(interval[1:2])) } # Pearson's r Pearsons <- function(x,y){ x <- x - mean(x); y <- y - mean(y) dotProduct <- 0 for(i in 1:length(x)){ dotProduct <- dotProduct + (x[i] * y[i]) } magnitudeX <- sqrt(sum(x**2)) magnitudeY <- sqrt(sum(y**2)) cosTheta <- dotProduct / (magnitudeX * magnitudeY) return(round(cosTheta,4)) } # Distance Correlation distanceCorrelation<- function(x,y){ # Function to double center the data doubleCenter <- function(x){ centered <- x for(i in 1:dim(x)[1]){ for(j in 1:dim(x)[2]){ centered[i,j] = x[i,j] - mean(x[i,]) - mean(x[,j]) + mean(x) } } return(centered) } # Calculate the distance covariance distanceCovariance <- function(x,y){ N <- length(x) distX <- as.matrix(dist(x)) distY <- as.matrix(dist(y)) centeredX <- doubleCenter(distX) centeredY <- doubleCenter(distY) calc = sum(centeredX * centeredY) return(sqrt(calc/(N^2))) } # Use distanceCovariance(x,x) to get distance Variance of x distanceVariance <- function(x){return(distanceCovariance(x,x))} # Find numerator and denominator, return distance correlation cov = distanceCovariance(x,y) sd = sqrt(distanceVariance(x)*distanceVariance(y)) return(round(cov/sd ,4)) } # Maximal Information Coefficient MIC <- function(x,y){ # Get joint distribution of x and y jointDist <- function(x,y){ N <- length(x) u <- unique(append(x,y)) joint <- c() for(i in u){ for(j in u){ f <- x[paste(x,y) == paste(i,j)] joint <- append(joint, length(f)/N) } } return(joint) } # Calculate the product of the marginal distributions of x and y marginalProduct <- function(x,y){ N <- length(x) u <- unique(append(x,y)) marginal <- c() for(i in u){ for(j in u){ fX <- length(x[x == i]) / N fY <- length(y[y == j]) / N marginal <- append(marginal, fX * fY) } } return(marginal) } # Mutual Information of x and y mutualInfo <- function(x,y){ joint <- jointDist(x,y) marginal <- marginalProduct(x,y) Hjm <- - sum(joint[marginal > 0] * log(marginal[marginal > 0],2)) Hj <- - sum(joint[joint > 0] * log(joint[joint > 0],2)) return(Hjm - Hj) } # Start binning x and y to find MIC N <- length(x) maxBins <- ceiling(N ** 0.6) MI <- c() for(i in 2:maxBins) { for (j in 2:maxBins){ if(i * j > maxBins){ next } Xbins <- i; Ybins <- j binnedX <-cut(x, breaks=Xbins, labels = 1:Xbins) binnedY <-cut(y, breaks=Ybins, labels = 1:Ybins) MI_estimate <- mutualInfo(binnedX,binnedY) MI_normalized <- MI_estimate / log(min(Xbins,Ybins),2) MI <- append(MI, MI_normalized) } } return(round(max(MI),4)) } # Test data set.seed(123) # Noise x0 <- rnorm(100,0,1) y0 <- rnorm(100,0,1) plot(y0~x0, pch =18) cor(x0,y0) distanceCorrelation(x0,y0) MIC(x0,y0) # Simple linear relationship x1 <- -20:20 y1 <- x1 + rnorm(41,0,4) plot(y1~x1, pch =18) Pearsons(x1,y1) distanceCorrelation(x1,y1) MIC(x1,y1) # y ~ x**2 x2 <- -20:20 y2 <- x2**2 + rnorm(41,0,40) plot(y2~x2, pch = 18) Pearsons(x2,y2) distanceCorrelation(x2,y2) MIC(x2,y2) # Cosine x3 <- -20:20 y3 <- cos(x3/4) + rnorm(41,0,0.2) plot(y3~x3, type='p', pch=18) abline(nls(y3~cos(x3/4)),col='red',lwd=2) cor(x3,y3) Pearsons(x3,y3) distanceCorrelation(x3,y3) MIC(x3,y3) # Circle n <- 50 theta <- runif (n, 0, 2*pi) x4 <- append(cos (theta) , cos(theta)) y4 <- append( sin (theta), -sin (theta)) plot(x4,y4, pch=18) Pearsons(x4,y4) distanceCorrelation(x4,y4) MIC(x4,y4) cor(x4,y4)

### lu2608 commented May 10, 2022

Hi, I have used the given code to compute the distance correlation of my two generated variables in basic R. Although I have adjusted the chunk to my specific variables R will not give me a result but only defines the function - What might be the problem here?