Skip to content

Instantly share code, notes, and snippets.

@emilyhoughkovacs
Created March 23, 2014 21:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save emilyhoughkovacs/9730315 to your computer and use it in GitHub Desktop.
Save emilyhoughkovacs/9730315 to your computer and use it in GitHub Desktop.
# Load data ----
# All CSV inputs are read relative to the working directory set here; later
# relative paths in this script depend on it as well.
setwd("/Users/emilyhoughkovacs/Documents/kaggle/")
data_folder <- './'
regular_season_results <- read.csv(
  paste0(data_folder, "regular_season_results.csv")
)
sample_submission <- read.csv(
  paste0(data_folder, "sample_submission.csv")
)
seasons <- read.csv(
  paste0(data_folder, "seasons.csv")
)
teams <- read.csv(
  paste0(data_folder, "teams.csv")
)
tourney_results <- read.csv(
  paste0(data_folder, "tourney_results.csv")
)
tourney_seeds <- read.csv(
  paste0(data_folder, "tourney_seeds.csv")
)
tourney_slots <- read.csv(
  paste0(data_folder, "tourney_slots.csv")
)
solution <- read.csv(
  paste0(data_folder, "solution.csv")
)
# Cleaning up ----
# Parse the season's day-zero text (month/day/four-digit-year) into a Date.
seasons$dayzerodt <- as.Date(seasons$dayzero, format = "%m/%d/%Y")
# Split each solution id by character position: character 1 is the season
# code, characters 3-5 the first team id, characters 7-9 the second team id.
solution$id.1 <- as.numeric(substr(solution$id, 3, 5))
solution$id.2 <- as.numeric(substr(solution$id, 7, 9))
# Derive the level set through factor() instead of calling levels() on the
# raw column. When read.csv() returns character columns (the default since
# R 4.0), levels(seasons$season) is NULL and every value of solution$season
# would silently become NA. factor() yields the same sorted levels whether
# the column is already a factor or plain character.
solution$season <- factor(
  substr(solution$id, 1, 1),
  levels = levels(factor(seasons$season))
)
# Playing around ----
# Start from a copy of the solution table, drop its Usage column and set
# every prediction to 0.
sample_submission_old <- solution
sample_submission_old[["Usage"]] <- NULL
sample_submission_old[["pred"]] <- 0
# Numeric seed: characters 2-3 of the seed label converted to a number.
tourney_seeds[["seedn"]] <- as.numeric(
  substring(tourney_seeds[["seed"]], 2, 3)
)
# Logistic transform 1 / (1 + exp(-x)), vectorised over x; results lie
# between 0 and 1.
# NOTE(review): despite its name this is the inverse of the logit function
# (the logistic/sigmoid). The name is kept because the rest of the script
# calls it as logit().
logit <- function(x) {
  decay <- exp(-x)
  1 / (1 + decay)
}
# Rows of tourney_seeds whose season code is 'S'.
# NOTE(review): the variable name suggests 'S' is the 2014 season - confirm.
seeds2014 <- tourney_seeds[which(tourney_seeds$season == 'S'), , drop = FALSE]

# Eyeball how the logistic transform spreads the numeric seeds at different
# scalings of the seed number.
hist(logit(seeds2014$seedn))
hist(logit(seeds2014$seedn / 5))
hist(logit(seeds2014$seedn / 30))
hist(logit(seeds2014$seedn / 20))
hist(logit(seeds2014$seedn / 15))
hist(logit(seeds2014$seedn / 25))
hist(logit(seeds2014$seedn / 100))
hist(logit(seeds2014$seedn / 2))
hist(logit(seeds2014$seedn))
View(seeds2014)
# Second pass over a narrower set of scalings.
hist(logit(seeds2014$seedn / 10))
hist(logit(seeds2014$seedn / 15))
hist(logit(seeds2014$seedn / 20))
# Build this season's matchup table ----
# Split each submission id by character position into the two team ids
# (characters 3-5 and characters 7-9).
sample_submission$id.1 <- as.numeric(substr(sample_submission$id, 3, 5))
sample_submission$id.2 <- as.numeric(substr(sample_submission$id, 7, 9))

# plyr supplies rename(). Install it only when it is missing instead of
# re-downloading on every run, and load it with library() so a failed load
# is an error rather than a silent FALSE.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
library(plyr)

# Attach the first team's numeric seed. Only `team` and `seedn` are taken
# from seeds2014, selected by name; the earlier positional selection
# seeds2014[, c(4, 5)] asked for a fifth column, which seeds2014 does not
# appear to have, and was immediately superseded anyway.
hough.kovacs.solution <- merge(
  sample_submission,
  seeds2014[, c("team", "seedn")],
  by.x = "id.1",
  by.y = "team"
)
hough.kovacs.solution <- rename(
  hough.kovacs.solution,
  replace = c("seedn" = "seed1")
)
if (interactive()) View(hough.kovacs.solution)

# Do it all over again with the second team.
hough.kovacs.solution <- merge(
  hough.kovacs.solution,
  seeds2014[, c("team", "seedn")],
  by.x = "id.2",
  by.y = "team"
)
hough.kovacs.solution <- rename(
  hough.kovacs.solution,
  replace = c("seedn" = "seed2")
)

# Seed difference: second team's seed minus first team's seed.
hough.kovacs.solution$diff <-
  hough.kovacs.solution$seed2 - hough.kovacs.solution$seed1
if (interactive()) View(hough.kovacs.solution)
hist(hough.kovacs.solution$diff)

# Provisional prediction: logistic transform of the raw seed difference.
hough.kovacs.solution$pred2 <- logit(hough.kovacs.solution$diff)
hist(hough.kovacs.solution$pred2)
hist(hough.kovacs.solution$pred2/20)
# Historical tournament results ----
# Drop the overtime, score and day-number columns; they are not used below.
tourney_results$numot <- NULL
tourney_results$wscore <- NULL
tourney_results$lscore <- NULL
tourney_results$daynum <- NULL
if (interactive()) View(tourney_results)
if (interactive()) View(tourney_seeds)
# Drop the seed label so only the numeric seed (seedn) is merged in.
tourney_seeds$seed <- NULL
if (interactive()) View(tourney_seeds)

# Merge historical tourney results with seed data: once keyed on the winning
# team and once on the losing team. The two seedn columns come back suffixed
# .x (winner) and .y (loser) and are renamed accordingly.
tourney_results <- merge(
  tourney_results, tourney_seeds,
  by.x = c("season", "wteam"), by.y = c("season", "team")
)
tourney_results <- merge(
  tourney_results, tourney_seeds,
  by.x = c("season", "lteam"), by.y = c("season", "team")
)
tourney_results <- rename(
  tourney_results,
  replace = c("seedn.x" = "seedw", "seedn.y" = "seedl")
)

# won is 1 when the winner's team id is smaller than the loser's, else 0,
# i.e. the outcome seen from the team with the lower id.
tourney_results$won <- as.numeric(
  tourney_results$wteam < tourney_results$lteam
)
if (interactive()) View(tourney_results)

# Console inspection of the intermediate values. The original history also
# contained tourney_results$wteam[tourney_results$won==1,], which fails with
# "incorrect number of dimensions" (a vector takes one subscript), and a
# read of the diff column before it existed; both are dropped.
tourney_results$seedl-tourney_results$seedw
tourney_results$won==1
tourney_results[tourney_results$won==1,]
tourney_results$wteam[tourney_results$won==1]

# Find difference in seeds, oriented the same way as won: loser's seed minus
# winner's seed when won is 1, winner's seed minus loser's seed when won is 0.
tourney_results$diff <- ifelse(
  tourney_results$won == 1,
  tourney_results$seedl - tourney_results$seedw,
  tourney_results$seedw - tourney_results$seedl
)
if (interactive()) View(tourney_results)
# Fit win/loss against seed difference ----
?glm
# Logistic regression of won on the seed difference alone.
model <- glm(won ~ diff, data = tourney_results, family = "binomial")
summary(model)
# We have a prediction! Fitted probabilities for the training rows.
tourney_results$pred <- predict(model, type = "response")

# Signed square of the seed difference: keeps the sign of diff while letting
# the effect grow with its square.
tourney_results$diffsq <- with(tourney_results, sign(diff) * diff^2)
model2 <- glm(won ~ diff + diffsq, data = tourney_results, family = "binomial")
summary(model2)
tourney_results$pred2 <- predict(model2, type = "response")
View(tourney_results)
hist(tourney_results$pred2)
# This year's prediction ----
# Same signed-square feature that model2 was fitted on.
hough.kovacs.solution$diffsq <-
  sign(hough.kovacs.solution$diff) * hough.kovacs.solution$diff^2
# Discard the provisional logistic-of-raw-difference column.
hough.kovacs.solution$pred2 <- NULL
# Predicted probabilities from model2 for every matchup row.
hough.kovacs.solution$pred <- predict(
  model2,
  newdata = hough.kovacs.solution,
  type = "response"
)
if (interactive()) View(hough.kovacs.solution)
hist(hough.kovacs.solution$pred)

# Correct formatting: keep only the id and pred columns, selected by name so
# the result does not depend on column positions after the merges.
# NOTE(review): the original selected columns 3 and 4 by position; these are
# presumably id and pred - confirm against sample_submission.csv.
hough.kovacs.solution <- hough.kovacs.solution[, c("id", "pred")]
if (interactive()) View(hough.kovacs.solution)

# The original history also contained
#   write.csv <- (hough.kovacs.solution, file=..., row.names=FALSE)
# which is a parse error (a parenthesised expression cannot contain commas)
# and stops the whole file from being sourced; only the valid call is kept.
write.csv(
  hough.kovacs.solution,
  file = "hough.kovacs.solution",
  row.names = FALSE
)
# Play with ggplot ----
# Install ggplot2 only when it is missing rather than on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
# Three views of the distribution of the predictions.
qplot(hough.kovacs.solution$pred)
qplot(hough.kovacs.solution$pred) + geom_density()
qplot(hough.kovacs.solution$pred, geom = "density")
# Command history only exists in an interactive session, so skip the save
# when the script is run non-interactively.
if (interactive()) {
  savehistory("~/Documents/kaggle/marchmadness.Rhistory")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment