Created
March 23, 2014 21:40
-
-
Save emilyhoughkovacs/9730315 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load data ----
# The working directory is a machine-specific path; the relative paths used
# below (and the relative output file written later) resolve against it.
setwd("/Users/emilyhoughkovacs/Documents/kaggle/")
data_folder <- "./"
regular_season_results <- read.csv(
  paste0(data_folder, "regular_season_results.csv")
)
sample_submission <- read.csv(paste0(data_folder, "sample_submission.csv"))
seasons <- read.csv(paste0(data_folder, "seasons.csv"))
teams <- read.csv(paste0(data_folder, "teams.csv"))
tourney_results <- read.csv(paste0(data_folder, "tourney_results.csv"))
tourney_seeds <- read.csv(paste0(data_folder, "tourney_seeds.csv"))
tourney_slots <- read.csv(paste0(data_folder, "tourney_slots.csv"))
solution <- read.csv(paste0(data_folder, "solution.csv"))
# Cleaning up ----
# Parse the season start date (month/day/year text) into a Date column.
seasons$dayzerodt <- as.Date(seasons$dayzero, format = "%m/%d/%Y")
# Split the id string by character position: character 1 is the season
# letter, characters 3-5 and 7-9 are the two numeric ids.
solution$id.1 <- as.numeric(substr(solution$id, 3, 5))
solution$id.2 <- as.numeric(substr(solution$id, 7, 9))
# levels() returns NULL when read.csv() yields `season` as character (the
# default since R 4.0), which would turn every value into NA. Going through
# factor() gives the same level set whether the column is factor or character.
solution$season <- factor(
  substr(solution$id, 1, 1),
  levels = levels(factor(seasons$season))
)
# Playing around ----
# Copy of the solution table with the Usage column dropped and pred set to 0.
sample_submission_old <- solution
sample_submission_old$Usage <- NULL
sample_submission_old$pred <- 0
# Numeric seed, taken from characters 2-3 of the seed code.
tourney_seeds$seedn <- as.numeric(substr(tourney_seeds$seed, 2, 3))
# Logistic (sigmoid) transform: maps any real x to 1 / (1 + exp(-x)).
# Note that, despite the name, this is the inverse of the logit function; the
# name is kept because the rest of the script calls it as `logit`.
# x: numeric vector. Returns a numeric vector of the same length.
logit <- function(x) {
  1 / (1 + exp(-x))
}
# Seeds for season "S"; used below to attach seeds to the submission rows.
seeds2014 <- subset(tourney_seeds, season == "S")
View(seeds2014)
# Compare how the transformed seed is distributed for several divisors.
# Each divisor the original session tried is plotted once.
for (divisor in c(1, 2, 5, 10, 15, 20, 25, 30, 100)) {
  hist(
    logit(seeds2014$seedn / divisor),
    main = paste0("Histogram of logit(seedn / ", divisor, ")"),
    xlab = paste0("logit(seedn / ", divisor, ")")
  )
}
# plyr supplies rename(); install it only when it is missing rather than on
# every run, and load it with library() so a failure is an error.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
library(plyr)

# Split the id string by character position into the two numeric ids.
sample_submission$id.1 <- as.numeric(substr(sample_submission$id, 3, 5))
sample_submission$id.2 <- as.numeric(substr(sample_submission$id, 7, 9))

# Attach the seed of the id.1 team. (An earlier attempt that selected columns
# 4:5 of seeds2014 was immediately overwritten by this merge and is removed.)
hough.kovacs.solution <- merge(
  sample_submission, seeds2014,
  by.x = "id.1", by.y = "team"
)
hough.kovacs.solution$seed <- NULL
hough.kovacs.solution$season <- NULL
hough.kovacs.solution <- rename(
  hough.kovacs.solution,
  replace = c("seedn" = "seed1")
)
View(hough.kovacs.solution)
# Do it all over again with the second team ----
hough.kovacs.solution <- merge(
  hough.kovacs.solution, seeds2014,
  by.x = "id.2", by.y = "team"
)
hough.kovacs.solution$seed <- NULL
hough.kovacs.solution$season <- NULL
hough.kovacs.solution <- rename(
  hough.kovacs.solution,
  replace = c("seedn" = "seed2")
)
# Seed gap: seed of the id.2 team minus seed of the id.1 team.
hough.kovacs.solution$diff <-
  hough.kovacs.solution$seed2 - hough.kovacs.solution$seed1
View(hough.kovacs.solution)
hist(hough.kovacs.solution$diff)
# First-pass guess: the raw seed gap pushed through the logistic transform.
hough.kovacs.solution$pred2 <- logit(hough.kovacs.solution$diff)
hist(hough.kovacs.solution$pred2)
hist(hough.kovacs.solution$pred2 / 20)
# Drop the overtime, score and day columns; the seed model below only uses
# season, the two team ids and the seeds merged in here.
tourney_results$numot <- NULL
tourney_results$wscore <- NULL
tourney_results$lscore <- NULL
tourney_results$daynum <- NULL
View(tourney_results)
View(tourney_seeds)
# Only the numeric seed (seedn) is needed from here on.
tourney_seeds$seed <- NULL
View(tourney_seeds)
# Merge historical tourney results with seed data ----
# First merge attaches the winner's seed, second the loser's; merge() suffixes
# the clashing seedn columns as .x (winner) and .y (loser).
tourney_results <- merge(
  tourney_results, tourney_seeds,
  by.x = c("season", "wteam"), by.y = c("season", "team")
)
tourney_results <- merge(
  tourney_results, tourney_seeds,
  by.x = c("season", "lteam"), by.y = c("season", "team")
)
tourney_results <- rename(
  tourney_results,
  replace = c("seedn.x" = "seedw", "seedn.y" = "seedl")
)
# won is 1 when the winning team has the numerically smaller id, otherwise 0.
tourney_results$won <-
  as.numeric(tourney_results$wteam < tourney_results$lteam)
View(tourney_results)
# Console inspection only; nothing is assigned here.
# Two statements from the original session are dropped: one read the `diff`
# column before it was created, the other indexed a vector with two
# dimensions, which is an error.
tourney_results$seedl - tourney_results$seedw
tourney_results$won == 1
tourney_results[tourney_results$won == 1, ]
tourney_results$wteam[tourney_results$won == 1]
# Find difference in seeds ----
# When won == 1 the gap is loser seed minus winner seed; otherwise it is
# winner seed minus loser seed. Either way this is the seed of the higher-id
# team minus the seed of the lower-id team.
tourney_results$diff <- with(
  tourney_results,
  ifelse(won == 1, seedl - seedw, seedw - seedl)
)
View(tourney_results)
# Logistic regression of won on the seed gap.
model <- glm(won ~ diff, data = tourney_results, family = "binomial")
summary(model)
# We have a prediction!
tourney_results$pred <- predict(model, type = "response")
# Signed square of the seed gap: squares the magnitude, keeps the direction.
tourney_results$diffsq <-
  sign(tourney_results$diff) * tourney_results$diff^2
model2 <- glm(
  won ~ diff + diffsq,
  data = tourney_results, family = "binomial"
)
summary(model2)
tourney_results$pred2 <- predict(model2, type = "response")
View(tourney_results)
hist(tourney_results$pred2)
# Build the same signed-square feature that model2 was fitted with.
hough.kovacs.solution$diffsq <-
  sign(hough.kovacs.solution$diff) * hough.kovacs.solution$diff^2
# Discard the earlier first-pass guess.
hough.kovacs.solution$pred2 <- NULL
# This year's prediction!
hough.kovacs.solution$pred <- predict(
  model2, hough.kovacs.solution,
  type = "response"
)
View(hough.kovacs.solution)
hist(hough.kovacs.solution$pred)
# Correct formatting ----
# Keep only the id and pred columns, selected by name so the result does not
# depend on the column order left behind by the merges.
hough.kovacs.solution <- hough.kovacs.solution[, c("id", "pred")]
View(hough.kovacs.solution)
# Written relative to the working directory; the file name has no extension.
write.csv(
  hough.kovacs.solution,
  file = "hough.kovacs.solution", row.names = FALSE
)
# Play with ggplot ----
# Install ggplot2 only when it is missing rather than on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
qplot(hough.kovacs.solution$pred)
qplot(hough.kovacs.solution$pred) + geom_density()
qplot(hough.kovacs.solution$pred, geom = "density")
savehistory("~/Documents/kaggle/marchmadness.Rhistory")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment