# ccagrawal/lsn_scraper.R

## lsn_scraper.R
# Session-wide setting: character data stays character (not factor) when data
# frames are built or read from CSV.
options(stringsAsFactors = FALSE)

#' Scrape applicant outcomes for one school and cycle from lawschoolnumbers.com.
#'
#' Reads http://SCHOOL.lawschoolnumbers.com/stats/CYCLE (placeholders replaced
#' by the arguments), keeps the lines from the first one containing
#' 'pointWidth:' to the chart container <div>, and parses the 2nd and 14th
#' lines of that slice as the accepted and rejected point series.
#'
#' @param school Text substituted for SCHOOL in the URL.
#' @param cycle Text substituted for CYCLE in the URL, e.g. '1415'.
#' @return Data frame with numeric columns LSAT, GPA, URM (1 when the point's
#'   text matches "(URM)', x: ", otherwise 0) and Outcome (1 for the accepted
#'   series, 0 for the rejected series). Accepted rows come first; a series
#'   without points contributes zero rows.
ScrapeLSN <- function(school, cycle) {

  base.url <- 'http://SCHOOL.lawschoolnumbers.com/stats/CYCLE'
  url <- gsub('CYCLE', cycle, gsub('SCHOOL', school, base.url))

  src <- readLines(url)
  first <- grep('pointWidth:', src)
  last <- grep('<div id="container" style="width: 630px; height: 525px;"></div>', src)
  if (length(first) == 0 || length(last) == 0) {
    # Fail with a readable message instead of the opaque error from `:`.
    stop('chart data not found at ', url, call. = FALSE)
  }
  src <- src[first[1]:last[1]]

  # Parse one series line into rows labelled with `outcome`. The frame is
  # built from whole vectors so a series with no points yields a 0-row frame
  # rather than failing on a scalar column assignment.
  parse.series <- function(line, outcome) {
    points <- strsplit(line, '\\{')[[1]]
    points <- points[grepl('name:', points)]
    data.frame(
      LSAT = as.numeric(gsub('.*x: ([0-9]*).*', '\\1', points)),
      GPA = as.numeric(gsub('.*y: ([0-9\\.]*).*', '\\1', points)),
      URM = as.numeric(grepl("\\(URM)', x: ", points)),
      Outcome = rep(outcome, length(points))
    )
  }

  rbind(parse.series(src[2], 1), parse.series(src[14], 0))
}

# Per-school admission model: for every school listed in schools.csv, fit a
# logistic regression of Outcome on LSAT, GPA and URM using the training
# cycles, score it on the test cycles, and save one summary row per school
# (coefficients, test sample size, test accuracy) to analysis.csv.
schools <- read.csv('./schools.csv', header = FALSE)[, 1]
train.cycles <- c('1415')
test.cycles <- c('1314')

df <- data.frame(matrix(nrow = length(schools), ncol = 7, data = 0))
colnames(df) <- c('school', 'Intercept', 'LSAT', 'GPA', 'URM', 'Sample', 'Accuracy')
df$school <- schools

# seq_len() skips the loop when there are no schools (1:0 would visit 1 and 0).
for (i in seq_len(nrow(df))) {

  school <- df[i, 'school']

  # Scrape each cycle once and stack the results with a single rbind.
  train.data <- do.call(rbind, lapply(train.cycles, function(cycle) ScrapeLSN(school, cycle)))

  fit <- glm(Outcome ~ LSAT + GPA + URM, data = train.data, family = 'binomial')
  df[i, c('Intercept', 'LSAT', 'GPA', 'URM')] <- coef(fit)

  test.data <- do.call(rbind, lapply(test.cycles, function(cycle) ScrapeLSN(school, cycle)))

  test.data$pred <- predict(fit, newdata = test.data[, c('LSAT', 'GPA', 'URM')], type = 'response')
  test.data$pred.int <- round(test.data$pred)

  # Rows whose prediction is NA count as neither correct nor incorrect.
  correct <- sum(test.data$Outcome == test.data$pred.int, na.rm = TRUE)
  incorrect <- sum(test.data$Outcome != test.data$pred.int, na.rm = TRUE)
  df[i, 'Sample'] <- (correct + incorrect)
  # NaN when the school has no scorable test rows.
  df[i, 'Accuracy'] <- correct / (correct + incorrect)

  cat(i, '/', nrow(df), '\n')
}

# Sample-weighted overall accuracy. na.rm drops the NaN products from schools
# with Sample == 0, which would otherwise turn the whole sum into NaN.
accuracy <- sum(df$Sample * df$Accuracy, na.rm = TRUE) / sum(df$Sample)

write.csv(df, 'analysis.csv')
	# Session-wide setting: character data stays character (not factor) when data
	# frames are built or read from CSV.
	options(stringsAsFactors = FALSE)

	#' Scrape applicant outcomes for one school and cycle from lawschoolnumbers.com.
	#'
	#' Reads http://SCHOOL.lawschoolnumbers.com/stats/CYCLE (placeholders replaced
	#' by the arguments), keeps the lines from the first one containing
	#' 'pointWidth:' to the chart container <div>, and parses the 2nd and 14th
	#' lines of that slice as the accepted and rejected point series.
	#'
	#' @param school Text substituted for SCHOOL in the URL.
	#' @param cycle Text substituted for CYCLE in the URL, e.g. '1415'.
	#' @return Data frame with numeric columns LSAT, GPA, URM (1 when the point's
	#'   text matches "(URM)', x: ", otherwise 0) and Outcome (1 for the accepted
	#'   series, 0 for the rejected series). Accepted rows come first; a series
	#'   without points contributes zero rows.
	ScrapeLSN <- function(school, cycle) {

	  base.url <- 'http://SCHOOL.lawschoolnumbers.com/stats/CYCLE'
	  url <- gsub('CYCLE', cycle, gsub('SCHOOL', school, base.url))

	  src <- readLines(url)
	  first <- grep('pointWidth:', src)
	  last <- grep('<div id="container" style="width: 630px; height: 525px;"></div>', src)
	  if (length(first) == 0 || length(last) == 0) {
	    # Fail with a readable message instead of the opaque error from `:`.
	    stop('chart data not found at ', url, call. = FALSE)
	  }
	  src <- src[first[1]:last[1]]

	  # Parse one series line into rows labelled with `outcome`. The leading and
	  # trailing `.*` strip everything around the captured number; without them
	  # the surrounding text survives and as.numeric() returns NA.
	  parse.series <- function(line, outcome) {
	    points <- strsplit(line, '\\{')[[1]]
	    points <- points[grepl('name:', points)]
	    # Built from whole vectors so an empty series gives a 0-row frame.
	    data.frame(
	      LSAT = as.numeric(gsub('.*x: ([0-9]*).*', '\\1', points)),
	      GPA = as.numeric(gsub('.*y: ([0-9\\.]*).*', '\\1', points)),
	      URM = as.numeric(grepl("\\(URM)', x: ", points)),
	      Outcome = rep(outcome, length(points))
	    )
	  }

	  rbind(parse.series(src[2], 1), parse.series(src[14], 0))
	}

	# Per-school admission model: for every school listed in schools.csv, fit a
	# logistic regression of Outcome on LSAT, GPA and URM using the training
	# cycles, score it on the test cycles, and save one summary row per school
	# (coefficients, test sample size, test accuracy) to analysis.csv.
	schools <- read.csv('./schools.csv', header = FALSE)[, 1]
	train.cycles <- c('1415')
	test.cycles <- c('1314')

	df <- data.frame(matrix(nrow = length(schools), ncol = 7, data = 0))
	colnames(df) <- c('school', 'Intercept', 'LSAT', 'GPA', 'URM', 'Sample', 'Accuracy')
	df$school <- schools

	# seq_len() skips the loop when there are no schools (1:0 would visit 1 and 0).
	for (i in seq_len(nrow(df))) {

	  school <- df[i, 'school']

	  # Scrape each cycle once and stack the results with a single rbind.
	  train.data <- do.call(rbind, lapply(train.cycles, function(cycle) ScrapeLSN(school, cycle)))

	  fit <- glm(Outcome ~ LSAT + GPA + URM, data = train.data, family = 'binomial')
	  df[i, c('Intercept', 'LSAT', 'GPA', 'URM')] <- coef(fit)

	  test.data <- do.call(rbind, lapply(test.cycles, function(cycle) ScrapeLSN(school, cycle)))

	  test.data$pred <- predict(fit, newdata = test.data[, c('LSAT', 'GPA', 'URM')], type = 'response')
	  test.data$pred.int <- round(test.data$pred)

	  # Rows whose prediction is NA count as neither correct nor incorrect.
	  correct <- sum(test.data$Outcome == test.data$pred.int, na.rm = TRUE)
	  incorrect <- sum(test.data$Outcome != test.data$pred.int, na.rm = TRUE)
	  df[i, 'Sample'] <- (correct + incorrect)
	  # NaN when the school has no scorable test rows.
	  df[i, 'Accuracy'] <- correct / (correct + incorrect)

	  cat(i, '/', nrow(df), '\n')
	}

	# Sample-weighted overall accuracy. na.rm drops the NaN products from schools
	# with Sample == 0, which would otherwise turn the whole sum into NaN.
	accuracy <- sum(df$Sample * df$Accuracy, na.rm = TRUE) / sum(df$Sample)

	write.csv(df, 'analysis.csv')