Skip to content

Instantly share code, notes, and snippets.

@ccagrawal
Created December 15, 2015 12:52
Show Gist options
  • Save ccagrawal/f82d321bdb8f63a11bc7 to your computer and use it in GitHub Desktop.
Save ccagrawal/f82d321bdb8f63a11bc7 to your computer and use it in GitHub Desktop.
Quantifying affirmative action in law school admissions
# Make data.frame()/read.csv() keep text columns as character vectors instead
# of converting them to factors (this is already the default from R 4.0.0 on).
options(stringsAsFactors = FALSE)
# Scrape one school's applicant scatter-plot data from lawschoolnumbers.com.
#
# Args:
#   school: Text substituted for SCHOOL (the subdomain) in the URL template.
#   cycle:  Text substituted for CYCLE in the URL template, e.g. '1415'.
#
# Returns:
#   A data frame with numeric columns LSAT (the point's x value), GPA (the
#   point's y value), URM (1 when the point's name ends in "(URM)", else 0)
#   and Outcome (1 for points from the accepted series, 0 for points from the
#   rejected series). Accepted rows come first. A series without any points
#   contributes zero rows instead of raising an error.
#
# Stops with an error when the chart markers are not found in the page.
ScrapeLSN <- function(school, cycle) {
  # Convert one chart-series line into a data frame. Points are the chunks
  # produced by splitting on '{' that contain a 'name:' field.
  ParseSeries <- function(series.line, outcome) {
    points <- strsplit(series.line, '\\{')[[1]]
    points <- points[grepl('name:', points)]

    # Building the columns directly (rather than filling a preallocated frame)
    # keeps this valid for zero points, where `df$Outcome <- 1` would fail.
    data.frame(
      LSAT = as.numeric(gsub('.*x: ([0-9]*).*', '\\1', points)),
      GPA = as.numeric(gsub('.*y: ([0-9\\.]*).*', '\\1', points)),
      # Literal match: the name must end in "(URM)" right before the x field.
      URM = as.numeric(grepl("(URM)', x: ", points, fixed = TRUE)),
      Outcome = rep(outcome, length(points))
    )
  }

  base.url <- 'http://SCHOOL.lawschoolnumbers.com/stats/CYCLE'
  url <- gsub('SCHOOL', school, base.url)
  url <- gsub('CYCLE', cycle, url)

  src <- readLines(url)

  # Keep only the lines from the chart options down to the chart container.
  first <- grep('pointWidth:', src)
  last <- grep(
    '<div id="container" style="width: 630px; height: 525px;"></div>',
    src
  )
  if (length(first) == 0 || length(last) == 0) {
    stop('Could not locate the chart data in ', url, call. = FALSE)
  }
  src <- src[first[1]:last[1]]

  # Within that slice the accepted series is read from line 2 and the
  # rejected series from line 14.
  rbind(ParseSeries(src[2], 1), ParseSeries(src[14], 0))
}
# Schools to analyse: the first column of a header-less CSV file.
schools <- read.csv('./schools.csv', header = FALSE)[, 1]

# Cycles used to fit each school's model, and cycles used to evaluate it.
train.cycles <- c('1415')
test.cycles <- c('1314')

# One row per school: fitted coefficients, evaluated sample size and accuracy.
df <- data.frame(matrix(nrow = length(schools), ncol = 7, data = 0))
colnames(df) <- c('school', 'Intercept', 'LSAT', 'GPA', 'URM', 'Sample',
                  'Accuracy')
df$school <- schools

# seq_len() runs zero iterations for zero schools; 1:nrow(df) would run two.
for (i in seq_len(nrow(df))) {
  school <- df[i, 'school']

  # Scrape every training cycle, then bind once instead of growing a frame.
  train.data <- do.call(
    rbind,
    lapply(train.cycles, function(cycle) ScrapeLSN(school, cycle))
  )

  # Logistic regression of the admission outcome on LSAT, GPA and URM status.
  fit <- glm(Outcome ~ LSAT + GPA + URM, data = train.data,
             family = 'binomial')
  df[i, c('Intercept', 'LSAT', 'GPA', 'URM')] <- coef(fit)

  test.data <- do.call(
    rbind,
    lapply(test.cycles, function(cycle) ScrapeLSN(school, cycle))
  )
  test.data$pred <- predict(
    fit,
    newdata = test.data[, c('LSAT', 'GPA', 'URM')],
    type = 'response'
  )
  # Round the predicted probability to a 0/1 class label.
  test.data$pred.int <- round(test.data$pred)

  # Outcome is 0 or 1 as built by ScrapeLSN, so a prediction is correct
  # exactly when the two values are equal. Rows whose prediction is NA
  # compare as NA and are left out of both counts.
  correct <- sum(test.data$Outcome == test.data$pred.int, na.rm = TRUE)
  incorrect <- sum(test.data$Outcome != test.data$pred.int, na.rm = TRUE)

  df[i, 'Sample'] <- (correct + incorrect)
  df[i, 'Accuracy'] <- correct / (correct + incorrect)

  # Progress indicator.
  cat(i, '/', nrow(df), '\n')
}

# Sample-weighted accuracy over all schools (kept in memory, not written out).
accuracy <- sum(df$Sample * df$Accuracy) / sum(df$Sample)

write.csv(df, 'analysis.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment