alexhanna/rupaul.R

## rupaul.R
library(RCurl)
library(ggplot2)
library(survival)

## plot Kaplan-Meier curve
plotKM <- function(df, metric) {
	## make age categorical
	df$Age[df$Age < 25] <- 1
	df$Age[df$Age >= 25 & df$Age < 31] <- 2
	df$Age[df$Age >= 31] <- 3

	t.surv  <- with(df, Surv(Start, End, Out))
	t.fit   <- survfit(t.surv ~ df[[ metric ]])

	## a lot of this coming from here: http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf
	## create strata vector
	t.strata <- NULL
	for(f.i in 1:length(t.fit$strata)) {
		# add vector for one strata according to number of rows of strata
		t.strata <- c(t.strata, rep(names(t.fit$strata)[f.i], t.fit$strata[f.i]))
	}

	## create a data frame from fit data
	t.survframe <- data.frame( time = t.fit$time, strata = t.strata, surv = t.fit$surv, censor = t.fit$n.censor )

	## plot
	p <- ggplot(t.survframe)
	p <- p + geom_step(aes(time, surv, color = strata), direction="hv")

	## use points for censors
	p <- p + geom_point(aes(time, surv, color = strata), data = subset(t.survframe, censor == 1))

	if (metric == "Age") {
		p <- p + scale_colour_discrete(
                       name="Age",
                       labels=c("< 25", "25 - 30", "> 30"))
	} else if (metric == "Wins") {
		p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3", "4"))
	} else if (metric == "Lipsyncs") {
		p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3"))
	}

	ggsave(p, filename = paste("km-", metric, ".png", sep = ""), width = 10, height = 8)
}

plotKM(df, "Age")
plotKM(df, "Wins")
plotKM(df, "Lipsyncs")

myCsv <- getURL("https://docs.google.com/spreadsheet/pub?key=0Avriupr1jL4kdEVRcklVX3J0bkJibEswbVdiZ0pnM0E&output=csv")
raw   <- read.csv(textConnection(myCsv))

currentSeason <- 5

## replace NAs with 0
raw[is.na(raw)] <- 0

raw$CompLeft <- 0

## control for the number of competitors left
for (i in 1:currentSeason) {
	df.sub <- raw[which(raw$Season == i), ]
	cl     <- max(df.sub$Place) - df.sub$End
	raw[which(raw$Season == i), ]$CompLeft <- cl

	## fix for final round
	if (i < currentSeason) {
		raw[which(raw$Season == i & raw$Place <= 3),]$CompLeft <- 3
	}
}

## Calculate a lipsync variable such that it only counts for the person who stayed in.
## Outs are not counted for those who ended in the top three of each season.
raw$LipsyncWithoutOut <- raw$Lipsyncs
raw[which(raw$Out == 1 & raw$Place >= 3),]$LipsyncWithoutOut <- raw[which(raw$Out == 1 & raw$Place >= 3),]$Lipsyncs - 1

## Willam was disqualified and therefore did not leave with a lipsync
raw[which(raw$Name == "Willam" & raw$Out == 1),]$LipsyncWithoutOut <- subset(raw, Name == "Willam" & Out == 1)$Lipsyncs

## restrict the data the model is trained on to first four seasons
df <- raw[which(raw$Season < 5),]

## Survival object
t.surv <- with(df, Surv(Start, End, Out))

## time invariant models
t.cox1  <- coxph(t.surv ~ (Age + PlusSize + PuertoRico), df)

## time variant models. need to use robust to account for temporal dependence
t.cox2  <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + Lipsyncs) + cluster(ID), df)
t.cox3  <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + LipsyncWithoutOut) + cluster(ID), df)

## Prediction
currentEpisode <- 6

## use the fifth season to predict outcomes
dp <- raw[which(raw$Season == 5),]
dp[is.na(dp)] <- 0

# do the lipsync correction
dp$LipsyncWithoutOut <- dp$Lipsyncs
dp[which(dp$Out == 1 & dp$Place > 3),]$LipsyncWithoutOut <- dp[which(dp$Out == 1 & dp$Place > 3),]$Lipsyncs - 1

## generate the relative risk with preferred model and merge it with the existing data
t.predict    <- predict(t.cox2, dp, "risk", se.fit = T)
dp$relRisk   <- t.predict$fit
dp$seRelRisk <- t.predict$se.fit

## clean up the data frame and display the ordered relative risks
predictions <- dp[which(dp$End == currentEpisode & dp$Out == 0),]
ordered     <- predictions[with(predictions, order(relRisk)),]
ordered     <- data.frame(Name = ordered$Name, relRisk = ordered$relRisk, se.relRisk = ordered$seRelRisk)
	library(RCurl)
	library(ggplot2)
	library(survival)

	## plot Kaplan-Meier curve
	plotKM <- function(df, metric) {
	## make age categorical
	df$Age[df$Age < 25] <- 1
	df$Age[df$Age >= 25 & df$Age < 31] <- 2
	df$Age[df$Age >= 31] <- 3

	t.surv <- with(df, Surv(Start, End, Out))
	t.fit <- survfit(t.surv ~ df[[ metric ]])

	## a lot of this coming from here: http://www.ceb-institute.org/bbs/wp-content/uploads/2011/09/handout_ggplot2.pdf
	## create strata vector
	t.strata <- NULL
	for(f.i in 1:length(t.fit$strata)) {
	# add vector for one strata according to number of rows of strata
	t.strata <- c(t.strata, rep(names(t.fit$strata)[f.i], t.fit$strata[f.i]))
	}

	## create a data frame from fit data
	t.survframe <- data.frame( time = t.fit$time, strata = t.strata, surv = t.fit$surv, censor = t.fit$n.censor )

	## plot
	p <- ggplot(t.survframe)
	p <- p + geom_step(aes(time, surv, color = strata), direction="hv")

	## use points for censors
	p <- p + geom_point(aes(time, surv, color = strata), data = subset(t.survframe, censor == 1))

	if (metric == "Age") {
	p <- p + scale_colour_discrete(
	name="Age",
	labels=c("< 25", "25 - 30", "> 30"))
	} else if (metric == "Wins") {
	p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3", "4"))
	} else if (metric == "Lipsyncs") {
	p <- p + scale_colour_discrete(name = metric, labels = c("0", "1", "2", "3"))
	}

	ggsave(p, filename = paste("km-", metric, ".png", sep = ""), width = 10, height = 8)
	}

	plotKM(df, "Age")
	plotKM(df, "Wins")
	plotKM(df, "Lipsyncs")

	myCsv <- getURL("https://docs.google.com/spreadsheet/pub?key=0Avriupr1jL4kdEVRcklVX3J0bkJibEswbVdiZ0pnM0E&output=csv")
	raw <- read.csv(textConnection(myCsv))

	currentSeason <- 5

	## replace NAs with 0
	raw[is.na(raw)] <- 0

	raw$CompLeft <- 0

	## control for the number of competitors left
	for (i in 1:currentSeason) {
	df.sub <- raw[which(raw$Season == i), ]
	cl <- max(df.sub$Place) - df.sub$End
	raw[which(raw$Season == i), ]$CompLeft <- cl

	## fix for final round
	if (i < currentSeason) {
	raw[which(raw$Season == i & raw$Place <= 3),]$CompLeft <- 3
	}
	}

	## Calculate a lipsync variable such that it only counts for the person who stayed in.
	## Outs are not counted for those who ended in the top three of each season.
	raw$LipsyncWithoutOut <- raw$Lipsyncs
	raw[which(raw$Out == 1 & raw$Place >= 3),]$LipsyncWithoutOut <- raw[which(raw$Out == 1 & raw$Place >= 3),]$Lipsyncs - 1

	## Willam was disqualified and therefore did not leave with a lipsync
	raw[which(raw$Name == "Willam" & raw$Out == 1),]$LipsyncWithoutOut <- subset(raw, Name == "Willam" & Out == 1)$Lipsyncs

	## restrict the data the model is trained on to first four seasons
	df <- raw[which(raw$Season < 5),]

	## Survival object
	t.surv <- with(df, Surv(Start, End, Out))

	## time invariant models
	t.cox1 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico), df)

	## time variant models. need to use robust to account for temporal dependence
	t.cox2 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + Lipsyncs) + cluster(ID), df)
	t.cox3 <- coxph(t.surv ~ (Age + PlusSize + PuertoRico + Wins + Highs + Lows + LipsyncWithoutOut) + cluster(ID), df)

	## Prediction
	currentEpisode <- 6

	## use the fifth season to predict outcomes
	dp <- raw[which(raw$Season == 5),]
	dp[is.na(dp)] <- 0

	# do the lipsync correction
	dp$LipsyncWithoutOut <- dp$Lipsyncs
	dp[which(dp$Out == 1 & dp$Place > 3),]$LipsyncWithoutOut <- dp[which(dp$Out == 1 & dp$Place > 3),]$Lipsyncs - 1

	## generate the relative risk with preferred model and merge it with the existing data
	t.predict <- predict(t.cox2, dp, "risk", se.fit = T)
	dp$relRisk <- t.predict$fit
	dp$seRelRisk <- t.predict$se.fit

	## clean up the data frame and display the ordered relative risks
	predictions <- dp[which(dp$End == currentEpisode & dp$Out == 0),]
	ordered <- predictions[with(predictions, order(relRisk)),]
	ordered <- data.frame(Name = ordered$Name, relRisk = ordered$relRisk, se.relRisk = ordered$seRelRisk)