Skip to content
{{ message }}

Instantly share code, notes, and snippets.

# alexhanna/rupaul.R

Last active Dec 14, 2015
RuPaul's drag race
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 library(RCurl) library(ggplot2) library(survival) ## plot Kaplan-Meier curve plotKM <- function(df, metric) { ## make age categorical df\$Age[df\$Age < 25] <- 1 df\$Age[df\$Age >= 25 & df\$Age < 31] <- 2 df\$Age[df\$Age >= 31] <- 3 t.surv <- with(df, Surv(Start, End, Out)) t.fit <- survfit(t.surv ~ df[[ metric ]]) ## a lot of this coming from here: http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf ## create strata vector t.strata <- NULL for(f.i in 1:length(t.fit\$strata)) { # add vector for one strata according to number of rows of strata t.strata <- c(t.strata, rep(names(t.fit\$strata)[f.i], t.fit\$strata[f.i])) } ## create a data frame from fit data t.survframe <- data.frame( time = t.fit\$time, strata = t.strata, surv = t.fit\$surv, censor = t.fit\$n.censor ) ## plot p <- ggplot(t.survframe) p <- p + geom_step(aes(time, surv, color = strata), direction="hv") ## use points for censors p <- p + geom_point(aes(time, surv, color = strata), data = subset(t.survframe, censor == 1)) if (metric == "Age") { p <- p + scale_colour_discrete( name="Age", labels=c("< 25", "25 - 30", "> 30")) } else if (metric == "Wins") { p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3", "4")) } else if (metric == "Lipsyncs") { p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3")) } ggsave(p, filename = paste("km-", metric, ".png", sep = ""), width = 10, height = 8) } plotKM(df, "Age") plotKM(df, "Wins") plotKM(df, "Lipsyncs") myCsv <- getURL("https://docs.google.com/spreadsheet/pub?key=0Avriupr1jL4kdEVRcklVX3J0bkJibEswbVdiZ0pnM0E&output=csv") raw <- read.csv(textConnection(myCsv)) currentSeason <- 5 ## replace NAs with 0 raw[is.na(raw)] <- 0 raw\$CompLeft <- 0 ## control for the number of competitors left for (i in 1:currentSeason) { df.sub <- raw[which(raw\$Season == i), ] cl <- max(df.sub\$Place) - df.sub\$End raw[which(raw\$Season == i), ]\$CompLeft <- cl ## fix for final round if (i < currentSeason) { raw[which(raw\$Season == i & raw\$Place <= 3),]\$CompLeft <- 3 } } ## Calculate a lipsync variable such that it only counts for the person who stayed in. ## Outs are not counted for those who ended in the top three of each season. raw\$LipsyncWithoutOut <- raw\$Lipsyncs raw[which(raw\$Out == 1 & raw\$Place >= 3),]\$LipsyncWithoutOut <- raw[which(raw\$Out == 1 & raw\$Place >= 3),]\$Lipsyncs - 1 ## Willam was disqualified and therefore did not leave with a lipsync raw[which(raw\$Name == "Willam" & raw\$Out == 1),]\$LipsyncWithoutOut <- subset(raw, Name == "Willam" & Out == 1)\$Lipsyncs ## restrict the data the model is trained on to first four seasons df <- raw[which(raw\$Season < 5),] ## Survival object t.surv <- with(df, Surv(Start, End, Out)) ## time invariant models t.cox1 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico), df) ## time variant models. need to use robust to account for temporal dependence t.cox2 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + Lipsyncs) + cluster(ID), df) t.cox3 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + LipsyncWithoutOut) + cluster(ID), df) ## Prediction currentEpisode <- 6 ## use the fifth season to predict outcomes dp <- raw[which(raw\$Season == 5),] dp[is.na(dp)] <- 0 # do the lipsync correction dp\$LipsyncWithoutOut <- dp\$Lipsyncs dp[which(dp\$Out == 1 & dp\$Place > 3),]\$LipsyncWithoutOut <- dp[which(dp\$Out == 1 & dp\$Place > 3),]\$Lipsyncs - 1 ## generate the relative risk with preferred model and merge it with the existing data t.predict <- predict(t.cox2, dp, "risk", se.fit = T) dp\$relRisk <- t.predict\$fit dp\$seRelRisk <- t.predict\$se.fit ## clean up the data frame and display the ordered relative risks predictions <- dp[which(dp\$End == currentEpisode & dp\$Out == 0),] ordered <- predictions[with(predictions, order(relRisk)),] ordered <- data.frame(Name = ordered\$Name, relRisk = ordered\$relRisk, se.relRisk = ordered\$seRelRisk)
to join this conversation on GitHub. Already have an account? Sign in to comment