Last active
July 3, 2018 18:03
-
-
Save kdpeterson51/4a420dc98f4a427ae39e41811871bf79 to your computer and use it in GitHub Desktop.
R script for "Practical Prelude to Machine Learning for Sport"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setup ----
# The workspace is intentionally NOT cleared here (`rm(list = ls())` would
# destroy whatever the caller has in their global environment). Run this
# script in a fresh R session if a clean workspace is wanted.

# Load package: install 'mlf' only when it is missing, so re-running the
# script does not re-download and reinstall the package every time.
if (!requireNamespace("mlf", quietly = TRUE)) {
  install.packages("mlf")
}
library(mlf)
# Simulated data: Figure 1A

#' Simulate a variable whose sample correlation with `y` is exactly `r`
#'
#' Draws standard-normal noise, keeps only the part of it that is linearly
#' unrelated to `y` (the residuals of regressing the noise on `y`), and mixes
#' that part with `y` so that `cor(result, y)` equals `r` up to floating-point
#' error.
#'
#' @param y Numeric vector to correlate against (at least 3 values, otherwise
#'   the regression residuals are degenerate).
#' @param r Desired Pearson correlation; must lie in [-1, 1], otherwise
#'   `sqrt(1 - r^2)` would silently yield `NaN`.
#' @return Numeric vector the same length as `y`. It carries the element
#'   names produced by `residuals()`.
cor_simulate <- function(y, r) {
  stopifnot(
    is.numeric(y), length(y) >= 3,
    is.numeric(r), all(abs(r) <= 1)
  )
  noise <- rnorm(length(y))
  # OLS residuals (model with intercept) have zero sample correlation with y.
  y_res <- residuals(lm(noise ~ y))
  # Scale both parts so the weights r and sqrt(1 - r^2) act on components of
  # comparable spread; this is what makes the resulting correlation exactly r.
  r * sd(y_res) * y + y_res * sd(y) * sqrt(1 - r^2)
}
# Figure 1A: linear relationship with a fixed sample correlation
set.seed(133) # Reproducibility
y2 <- rnorm(45, sd = 10)
r <- 0.40 # Desired correlation
# One simulated column per requested correlation, flattened into a single
# vector; vapply() pins every column to length(y2) numeric values instead of
# relying on sapply()'s input-dependent simplification.
sim_x <- vapply(r, function(r) cor_simulate(y2, r), numeric(length(y2)))
data <- data.frame(x = as.vector(sim_x), y = rep(y2, length(r)))
# Bare scatter plot (no axes ticks, no annotations) with the OLS fit in red
plot(data$x, data$y, pch = 16, xaxt = "n", yaxt = "n", ann = FALSE)
abline(lm(data$y ~ data$x), col = "red")
# Simulated data: Figure 1B
set.seed(6) # Reproducibility
x <- -20:20
# Quadratic signal plus Gaussian noise (mean 0, sd 60), one draw per x value
y <- x^2 + rnorm(length(x), mean = 0, sd = 60)
# Bare scatter plot (no axes ticks, no annotations); the red straight-line
# fit is drawn over data that is quadratic in x.
plot(y ~ x, pch = 16, xaxt = "n", yaxt = "n", ann = FALSE)
abline(lm(y ~ x), col = "red")
# Correlational derivatives Figure 1A
round(stats::cor(data$x, data$y), 2) # (Double-check)
round(mlf::distcorr(data$x, data$y), 2) # (should be similar to Pearson r)
round(mlf::mic(data$x, data$y), 2)

# Correlational derivatives Figure 1B
round(stats::cor(x, y), 2)
round(mlf::distcorr(x, y), 2)
round(mlf::mic(x, y), 2)
round(mlf::boot(x, y, mlf::mic), 2) # Confidence intervals
# Simulated binomial distribution
dist1 <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0) # 90/10
dist2 <- c(1, 1, 1, 1, 1, 1, 0, 0, 0, 0) # 60/40
# NOTE(review): the second argument (2) is presumably the logarithm base,
# i.e. entropy in bits -- confirm against the mlf::entropy documentation.
round(mlf::entropy(dist1, 2), 2)
round(mlf::entropy(dist2, 2), 2)
##### Computational "sieve" to detect meaningful variables #####
utils::data("mtcars") # Load sample data set

# Let's say we want to predict mpg: every other column is a candidate.
candidates <- setdiff(names(mtcars), "mpg")

# Pairwise MIC between mpg and each candidate, built in one pass.
# - `[[ ]]` hands mic() a plain numeric vector rather than a one-column
#   data frame.
# - Building the data frame directly avoids a leftover all-NA seed row and
#   keeps `mic` numeric instead of round-tripping through character.
predictors <- data.frame(
  mic = vapply(
    candidates,
    function(v) as.numeric(mlf::mic(mtcars$mpg, mtcars[[v]])),
    numeric(1),
    USE.NAMES = FALSE
  ),
  variable = candidates,
  stringsAsFactors = FALSE
)

predictors <- predictors[order(-predictors$mic), ] # Sort by MIC, descending
head(predictors)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment