# R script for "Practical Prelude to Machine Learning for Sport"
# (Originally published as a GitHub gist; web-page navigation text removed.)
# Setup ----
# NOTE: the former `rm(list = ls())` was dropped on purpose. It wipes whatever
# the caller has in their workspace, yet leaves attached packages and options
# untouched, so it never produced a truly clean session. Run this script in a
# fresh R session instead.
# Install 'mlf' only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("mlf", quietly = TRUE)) {
  install.packages("mlf")
}
library(mlf)
# Simulated data: Figure 1A
#' Simulate a variable with a prescribed sample correlation to `y`
#'
#' Draws standard-normal noise, keeps only the part of it that is
#' uncorrelated with `y` (the residuals of regressing the noise on `y`), and
#' mixes that part with `y` so that the sample Pearson correlation between
#' the result and `y` equals `r`.
#'
#' @param y Numeric vector to correlate with (at least 3 values).
#' @param r Desired Pearson correlation, between -1 and 1.
#' @return Numeric vector of the same length as `y`. The function draws from
#'   the random number generator, so call `set.seed()` beforehand for
#'   reproducible output.
cor_simulate <- function(y, r) {
  if (!is.numeric(y) || length(y) < 3L) {
    stop("`y` must be a numeric vector with at least 3 values.", call. = FALSE)
  }
  # Outside [-1, 1] the sqrt() below would silently return NaN.
  if (!is.numeric(r) || anyNA(r) || any(abs(r) > 1)) {
    stop("`r` must be a correlation between -1 and 1.", call. = FALSE)
  }
  noise <- rnorm(length(y))
  # Residuals of noise ~ y have zero sample correlation with y.
  y_res <- residuals(lm(noise ~ y))
  # Scaling each component by the other's sd makes the correlation exactly r.
  r * sd(y_res) * y + y_res * sd(y) * sqrt(1 - r^2)
}
set.seed(133) # Reproducibility
y2 <- rnorm(45, sd = 10)
r <- 0.40 # Desired correlation
# One simulated x-series per requested correlation, stacked end to end and
# paired with a matching copy of y2. vapply() (unlike sapply()) guarantees a
# numeric result with length(y2) values per correlation.
data <- data.frame(
  x = as.vector(
    vapply(r, function(r_i) cor_simulate(y2, r_i), numeric(length(y2)))
  ),
  y = rep(y2, length(r))
)
# Bare scatter plot (no axes or labels) with the least-squares line in red.
plot(data$x, data$y, pch = 16, xaxt = "n", yaxt = "n", ann = FALSE)
abline(lm(y ~ x, data = data), col = "red")
# Simulated data: Figure 1B (quadratic trend plus Gaussian noise)
set.seed(6) # Reproducibility
x <- -20:20
# Noise is sized from x instead of a hard-coded 41, so the two cannot drift.
y <- x^2 + rnorm(length(x), mean = 0, sd = 60)
# Bare scatter plot (no axes or labels) with the least-squares line in red.
plot(y ~ x, pch = 16, xaxt = "n", yaxt = "n", ann = FALSE)
abline(lm(y ~ x), col = "red")
# Correlational derivatives: Figure 1A
with(data, round(stats::cor(x, y), digits = 2)) # Pearson r (double-check)
with(data, round(mlf::distcorr(x, y), digits = 2)) # Should be similar to Pearson r
with(data, round(mlf::mic(x, y), digits = 2))
# Correlational derivatives: Figure 1B
round(stats::cor(x, y), digits = 2)
round(mlf::distcorr(x, y), digits = 2)
round(mlf::mic(x, y), digits = 2)
round(mlf::boot(x, y, mlf::mic), digits = 2) # Confidence intervals
# Two simulated binary samples with different class balance
dist1 <- rep(c(1, 0), times = c(9, 1)) # 90/10
dist2 <- rep(c(1, 0), times = c(6, 4)) # 60/40
# Entropy of each sample, rounded to two decimals.
round(mlf::entropy(dist1, 2), digits = 2)
round(mlf::entropy(dist2, 2), digits = 2)
##### Computational "sieve" to detect meaningful variables #####
data("mtcars") # Load sample data set
# Let's say we want to predict mpg: every other column is a candidate.
# Selecting candidates by name (rather than by position 2..n) stays correct
# even if the column order changes, and leaves no placeholder row of NAs in
# the result.
predictors <- data.frame(
  variable = setdiff(names(mtcars), "mpg"),
  stringsAsFactors = FALSE
)
# Pairwise MIC between mpg and each candidate. `[[` passes plain numeric
# vectors (as in the other mic() calls in this script), and vapply() keeps the
# scores numeric instead of round-tripping them through character.
predictors$mic <- vapply(
  predictors$variable,
  function(v) as.numeric(mlf::mic(mtcars$mpg, mtcars[[v]])),
  numeric(1),
  USE.NAMES = FALSE
)
# Sort by MIC (highest first) and keep the original column order.
predictors <- predictors[order(-predictors$mic), c("mic", "variable")]
head(predictors)
# End of script. (GitHub page footer text removed; it was not R code.)