# kdpeterson51/kdpeterson_SPSR.r

## kdpeterson_SPSR.r
# Clear workspace
# NOTE(review): this removes every object from the global environment of
# whoever runs or source()s the script; consider dropping it.
rm(list = ls())

# Load package
# Install mlf only when it is missing, so repeated runs do not re-download
# and re-install the package every time.
if (!requireNamespace("mlf", quietly = TRUE)) {
  install.packages("mlf")
}
library(mlf)


# Simulated data: Figure 1A
# Build a vector whose sample Pearson correlation with `y` equals `r`.
#   y: numeric vector to correlate against
#   r: target correlation
# Draws standard-normal noise, keeps only the part of it that is linearly
# unrelated to `y` (the residuals of noise ~ y), then mixes `y` and that
# part with weights r and sqrt(1 - r^2).
cor_simulate <- function(y, r) {
  noise <- rnorm(length(y))
  orthogonal <- residuals(lm(noise ~ y))
  signal_part <- r * sd(orthogonal) * y
  noise_part <- orthogonal * sd(y) * sqrt(1 - r^2)
  signal_part + noise_part
}

set.seed(133) # Reproducibility
y2 <- rnorm(45, sd = 10)
r <- 0.40 # Desired correlation
# One simulated column per requested correlation, flattened into a single x
# vector; y2 is repeated so every x value stays paired with its y value.
data <- data.frame(
  x = as.vector(
    vapply(r, function(rho) cor_simulate(y2, rho), numeric(length(y2)))
  ),
  y = rep(y2, length(r))
)

# Scatter plot without axes or labels, plus the linear fit of y on x in red
plot(data$x, data$y, pch = 16, xaxt = "n", yaxt = "n", ann = FALSE)
abline(lm(data$y ~ data$x), col = "red")


# Simulated data: Figure 1B
set.seed(6) # Reproducibility
x <- -20:20
# Quadratic signal plus Gaussian noise (sd = 60), one noise draw per x value
y <- x^2 + rnorm(length(x), mean = 0, sd = 60)

# Scatter plot without axes or labels, plus the linear fit of y on x in red
plot(y ~ x, pch = 16, xaxt = "n", yaxt = "n", ann = FALSE)
abline(lm(y ~ x), col = "red")


# Correlational derivatives Figure 1A
# Each measure is evaluated on the x and y columns of `data`, rounded to 2 dp.
with(data, round(stats::cor(x, y), digits = 2)) # (Double-check)
with(data, round(mlf::distcorr(x, y), digits = 2)) # (should be similar to Pearson r)
with(data, round(mlf::mic(x, y), digits = 2))


# Correlational derivatives Figure 1B
# Same three measures on the Figure 1B x and y vectors, rounded to 2 dp.
round(stats::cor(x, y), digits = 2)
round(mlf::distcorr(x, y), digits = 2)
round(mlf::mic(x, y), digits = 2)
round(mlf::boot(x, y, mlf::mic), digits = 2) # Confidence intervals


# Simulated binomial distribution
# Two 10-element 0/1 samples that differ only in how the ones and zeros split.
dist1 <- rep(c(1, 0), times = c(9, 1)) # 90/10
dist2 <- rep(c(1, 0), times = c(6, 4)) # 60/40

# Entropy of each sample (second argument 2), rounded to 2 dp
round(mlf::entropy(dist1, 2), digits = 2)
round(mlf::entropy(dist2, 2), digits = 2)


##### Computational "sieve" to detect meaningful variables #####
# Scores every mtcars column except mpg by its pairwise MIC with mpg and
# lists the variables from highest to lowest MIC.
data("mtcars") # Load sample data set

# Let's say we want to predict mpg (column 1), so every other column is a
# candidate predictor.
candidates <- names(mtcars)[-1]

# One MIC per candidate. `[[` hands mic() a plain vector rather than a
# one-column data frame, and building the table in a single step means no
# placeholder NA row is needed.
mic_by_variable <- vapply(
  candidates,
  function(v) mlf::mic(mtcars$mpg, mtcars[[v]]),
  numeric(1)
)
predictors <- data.frame(
  mic = unname(mic_by_variable),
  variable = candidates,
  stringsAsFactors = FALSE
)

predictors <- predictors[order(-predictors$mic), ] # Sort by MIC
head(predictors)
	# Clear workspace
	# NOTE(review): this removes every object from the global environment of
	# whoever runs or source()s the script; consider dropping it.
	rm(list = ls())

	# Load package
	# Install mlf only when it is missing, so repeated runs do not re-download
	# and re-install the package every time.
	if (!requireNamespace("mlf", quietly = TRUE)) {
	  install.packages("mlf")
	}
	library(mlf)



	# Simulated data: Figure 1A
	# Build a vector whose sample Pearson correlation with `y` equals `r`.
	#   y: numeric vector to correlate against
	#   r: target correlation
	# Draws standard-normal noise, keeps only the part of it that is linearly
	# unrelated to `y` (the residuals of noise ~ y), then mixes `y` and that
	# part with weights r and sqrt(1 - r^2).
	cor_simulate <- function(y, r) {
	  noise <- rnorm(length(y))
	  orthogonal <- residuals(lm(noise ~ y))
	  signal_part <- r * sd(orthogonal) * y
	  noise_part <- orthogonal * sd(y) * sqrt(1 - r^2)
	  signal_part + noise_part
	}

	set.seed(133) # Reproducibility
	y2 <- rnorm(45, sd = 10)
	r <- 0.40 # Desired correlation
	# One simulated column per requested correlation, flattened into a single x
	# vector; y2 is repeated so every x value stays paired with its y value.
	data <- data.frame(
	  x = as.vector(
	    vapply(r, function(rho) cor_simulate(y2, rho), numeric(length(y2)))
	  ),
	  y = rep(y2, length(r))
	)

	# Scatter plot without axes or labels, plus the linear fit of y on x in red
	plot(data$x, data$y, pch = 16, xaxt = "n", yaxt = "n", ann = FALSE)
	abline(lm(data$y ~ data$x), col = "red")



	# Simulated data: Figure 1B
	set.seed(6) # Reproducibility
	x <- -20:20
	# Quadratic signal plus Gaussian noise (sd = 60), one noise draw per x value
	y <- x^2 + rnorm(length(x), mean = 0, sd = 60)

	# Scatter plot without axes or labels, plus the linear fit of y on x in red
	plot(y ~ x, pch = 16, xaxt = "n", yaxt = "n", ann = FALSE)
	abline(lm(y ~ x), col = "red")



	# Correlational derivatives Figure 1A
	# Each measure is evaluated on the x and y columns of `data`, rounded to 2 dp.
	with(data, round(stats::cor(x, y), digits = 2)) # (Double-check)
	with(data, round(mlf::distcorr(x, y), digits = 2)) # (should be similar to Pearson r)
	with(data, round(mlf::mic(x, y), digits = 2))



	# Correlational derivatives Figure 1B
	# Same three measures on the Figure 1B x and y vectors, rounded to 2 dp.
	round(stats::cor(x, y), digits = 2)
	round(mlf::distcorr(x, y), digits = 2)
	round(mlf::mic(x, y), digits = 2)
	round(mlf::boot(x, y, mlf::mic), digits = 2) # Confidence intervals



	# Simulated binomial distribution
	# Two 10-element 0/1 samples that differ only in how the ones and zeros split.
	dist1 <- rep(c(1, 0), times = c(9, 1)) # 90/10
	dist2 <- rep(c(1, 0), times = c(6, 4)) # 60/40

	# Entropy of each sample (second argument 2), rounded to 2 dp
	round(mlf::entropy(dist1, 2), digits = 2)
	round(mlf::entropy(dist2, 2), digits = 2)



	##### Computational "sieve" to detect meaningful variables #####
	# Scores every mtcars column except mpg by its pairwise MIC with mpg and
	# lists the variables from highest to lowest MIC.
	data("mtcars") # Load sample data set

	# Let's say we want to predict mpg (column 1), so every other column is a
	# candidate predictor.
	candidates <- names(mtcars)[-1]

	# One MIC per candidate. `[[` hands mic() a plain vector rather than a
	# one-column data frame, and building the table in a single step means no
	# placeholder NA row is needed.
	mic_by_variable <- vapply(
	  candidates,
	  function(v) mlf::mic(mtcars$mpg, mtcars[[v]]),
	  numeric(1)
	)
	predictors <- data.frame(
	  mic = unname(mic_by_variable),
	  variable = candidates,
	  stringsAsFactors = FALSE
	)

	predictors <- predictors[order(-predictors$mic), ] # Sort by MIC
	head(predictors)