Skip to content

Instantly share code, notes, and snippets.

@MichaelChirico
Last active March 30, 2016 22:38
Show Gist options
  • Save MichaelChirico/ee258dd04e552aa1438bf408308f84bc to your computer and use it in GitHub Desktop.
Save MichaelChirico/ee258dd04e552aa1438bf408308f84bc to your computer and use it in GitHub Desktop.
scraping private school demographic info
library(rvest)
library(data.table)
# Listing pages on greatphillyschools.org, queried with the
# "Public Special Admission", "Public District" and "Public Charter"
# parameters all set to 0. URL2 is URL1 with a `page=2` parameter spliced in
# directly after the literal "schools?".
URL1 <- paste0("http://greatphillyschools.org/",
               "schools?Public%20Special%20Admission=0&",
               "Public%20District=0&Public%20Charter=0")
URL2 <- gsub("schools?", "schools?page=2&", URL1, fixed = TRUE)

# Absolute XPath of the container on a listing page whose links are harvested.
# NOTE(review): tied to the site's exact page layout; confirm it still matches.
listing_xpath <- paste0("/html/body/div[1]/div/div/",
                        "div[1]/div[3]/div/div[4]")

# Collect the school links from one listing page.
#
# page_url: URL of a listing page.
# Returns a character vector holding the `href` of every <a> inside the
# listing container that contains "schools/" and does not contain "Special".
scrape_school_links <- function(page_url) {
  # read_html() is the replacement for html(), which rvest deprecated.
  hrefs <- read_html(page_url) %>%
    html_nodes(xpath = listing_xpath) %>%
    html_nodes("a") %>%
    html_attr("href")
  hrefs[grepl("schools/", hrefs) & !grepl("Special", hrefs)]
}

urls1 <- scrape_school_links(URL1)
urls2 <- scrape_school_links(URL2)

# The host is prepended to every href (assumes the hrefs are site-relative
# paths -- TODO confirm).
urls <- paste0("http://greatphillyschools.org", c(urls1, urls2))
# XPath locations, on an individual school page, of the three pieces that get
# scraped: the name block, the enrollment list item and the demographics
# panel. All three hang off the same absolute prefix.
xpath_main <- '/html/body/div[1]/div[1]/div/div[1]/'
demo_xpath <- paste0(xpath_main,
                     'div[4]/div[1]/div/div/div[1]/div/div/div[2]')
enro_xpath <- paste0(xpath_main, 'div[3]/div[6]/div[1]/ul/li[4]')
name_xpath <- paste0(xpath_main, 'div[3]/div[1]/div[1]')

# One preallocated row per school URL: empty name, zero enrollment and zero
# for each of the four percentage columns. Rows are filled in later by
# position.
NN <- length(urls)
data <- data.table(
  name  = vector("character", NN),
  enrol = vector("integer", NN),
  pct_w = vector("numeric", NN),
  pct_b = vector("numeric", NN),
  pct_h = vector("numeric", NN),
  pct_a = vector("numeric", NN)
)

# Column names, in table order; used as the assignment target when a row is
# filled.
jj <- colnames(data)
# Extract one group's percentage from the scraped demographic strings.
#
# demos: character vector of trimmed, de-duplicated, non-empty div texts.
# label: text identifying the group (e.g. "White").
# Every entry of `demos` containing `label` has its ASCII letters and "%"
# removed and is converted with as.numeric(). Returns 0 when no entry
# contains `label`.
parse_pct <- function(demos, label) {
  pct <- as.numeric(gsub("[A-Za-z%]", "", demos[grepl(label, demos)]))
  if (length(pct) == 0L) 0 else pct
}

# Visit every school page and fill the matching row of `data` with the
# school's name, enrollment and the four demographic percentages.
for (ii in seq_along(urls)) {
  # read_html() is the replacement for html(), which rvest deprecated.
  sch_page <- read_html(urls[ii])
  demo_try <- sch_page %>% html_node(xpath = demo_xpath)

  # Skip pages with no demographics panel, leaving the preallocated row
  # (empty name, enrol 0) untouched. xml2-based rvest signals "no match" with
  # an `xml_missing` object rather than NULL, so both are checked; otherwise
  # such schools would be recorded with 0% for every group.
  if (is.null(demo_try) || inherits(demo_try, "xml_missing")) next

  # Text of every div under the panel, trimmed, de-duplicated, blanks dropped.
  demos <- unique(gsub("^\\s*|\\s*$", "",
                       demo_try %>% html_nodes("div") %>% html_text()))
  demos <- demos[demos != ""]

  # School name: text of the span(s) inside the name block.
  school_name <- sch_page %>%
    html_nodes(xpath = name_xpath) %>%
    html_nodes("span") %>%
    html_text()

  # Enrollment: the list item's text with ASCII letters removed, as integer.
  enrolment <- as.integer(gsub("[A-Za-z]", "",
                               sch_page %>%
                                 html_nodes(xpath = enro_xpath) %>%
                                 html_text()))

  # Assign all six columns of row ii in place.
  data[ii, (jj) := .(school_name, enrolment,
                     parse_pct(demos, "White"),
                     parse_pct(demos, "Black"),
                     parse_pct(demos, "Hispanic"),
                     parse_pct(demos, "Asian"))]
}
# Enrollment-weighted share of white students (as a fraction, not a percent)
# across all rows with positive enrollment.
data[enrol > 0, {
  white_students <- enrol * pct_w / 100
  sum(white_students) / sum(enrol)
}]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment