Skip to content

Instantly share code, notes, and snippets.

@daranzolin
Created May 13, 2016 15:05
Show Gist options
  • Save daranzolin/aa4bd4898c1db7e292896efe6ca0a0e6 to your computer and use it in GitHub Desktop.
Save daranzolin/aa4bd4898c1db7e292896efe6ca0a0e6 to your computer and use it in GitHub Desktop.
library(rvest)
library(dplyr)
library(stringr)
library(readr)
# Read schools.csv from the working directory and derive the URL path
# ("<county>/<district>/<school>") used to look each school up on ed-data.org.
schools <- read_csv("schools.csv") %>%
  # Drop rows 2535-2539 by position.
  # NOTE(review): presumably non-data rows in this particular export; the
  # indices are hard-coded, so confirm them before reusing with another file.
  slice(-c(2535:2539)) %>%
  mutate(
    # When the final word of the account name is exactly "School", strip that
    # trailing word. The pattern is anchored with `$` so that a name containing
    # "School" more than once loses the trailing occurrence (the one the test
    # just matched) rather than the first one.
    # `%in%` is used instead of `==` so that an NA name yields FALSE, not NA.
    # NOTE(review): the space before the removed word is kept, so the gsub()
    # below turns it into a trailing "-" in the path; confirm the site accepts
    # that form.
    school = ifelse(
      stringr::str_extract(`Account Name`, "(\\w+)$") %in% "School",
      stringr::str_replace(`Account Name`, "School$", ""),
      `Account Name`
    ),
    # Spaces become hyphens in every path component.
    school = gsub(" ", "-", school),
    county = gsub(" ", "-", County),
    district = gsub(" ", "-", District),
    path = paste(county, district, school, sep = "/")
  )
# Scrape one school's profile page.
#
# `path` is appended to "http://www.ed-data.org/school/" to form the page URL.
# Returns a character vector holding the text of every node matched by the CSS
# selector below, in document order. The loop later in this script reads
# element 1 as the status, element 2 as the enrollment and element 3 as the
# Title I field.
# Any error raised while fetching or parsing the page propagates to the caller.
get_enrolls_status_titlei <- function(path) {
  profile_selector <- ".profiledata:nth-child(7) span , .profiledata:nth-child(5) li+ li span"
  page <- read_html(paste0("http://www.ed-data.org/school/", path))
  matched_nodes <- html_nodes(page, profile_selector)
  as.character(html_text(matched_nodes))
}
# Initialize the result columns, then scrape every school in turn.
# Columns start as NA. A failed scrape stores the string "Unknown", which
# coerces that column to character from that point on; this is kept so the
# written CSV contains "Unknown" for schools that could not be scraped.
schools$enrolls <- rep(NA, nrow(schools))
schools$titlei <- rep(NA, nrow(schools))
schools$status <- rep(NA, nrow(schools))

# seq_len() gives an empty sequence for a zero-row table, whereas
# 1:nrow(schools) would iterate over c(1, 0).
for (i in seq_len(nrow(schools))) {
  # tryCatch() returns either the scraped character vector or the condition
  # object; the handler also reports the failure instead of hiding it.
  scraped <- tryCatch(
    get_enrolls_status_titlei(schools$path[i]),
    error = function(e) {
      message("Could not scrape ", schools$path[i], ": ", conditionMessage(e))
      e
    }
  )
  if (!inherits(scraped, "error")) {
    # grepl() already returns a single TRUE/FALSE (FALSE for NA input).
    schools$titlei[i] <- grepl("Yes", scraped[3])
    schools$status[i] <- stringr::str_trim(scraped[1])
    schools$enrolls[i] <- as.numeric(scraped[2])
  } else {
    schools$titlei[i] <- "Unknown"
    schools$status[i] <- "Unknown"
    schools$enrolls[i] <- "Unknown"
  }
}

# Write the augmented table to the working directory.
write_csv(schools, "school_data.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment