# daranzolin/web_scraping.R

## web_scraping.R
library(rvest)
library(dplyr)
library(stringr)
library(readr)

# Read the school list from the working directory and build, for every row,
# the "county/district/school" path that is later appended to the site URL.
schools <- read_csv("schools.csv") %>%
  # Drops rows 2535-2539 by position.
  # NOTE(review): the reason is not visible in this script -- presumably
  # non-school rows in this particular export; confirm before reusing the
  # script with a different schools.csv.
  slice(-c(2535:2539)) %>%
  mutate(
    # Strip a trailing standalone word "School" from the account name. The
    # pattern is anchored to the end of the string so that an earlier
    # occurrence (e.g. in "Schoolcraft Elementary School") is left alone;
    # names whose last word is not exactly "School" are returned unchanged.
    # NOTE(review): the space in front of the removed word is kept, so the
    # slug built below ends in "-"; confirm that is what the site expects.
    school = stringr::str_replace(`Account Name`, "\\bSchool$", ""),
    # Spaces become hyphens in every path component.
    school = gsub(" ", "-", school),
    county = gsub(" ", "-", County),
    district = gsub(" ", "-", District),
    path = paste(county, district, school, sep = "/")
  )

# Scrape the profile values for one school.
#
# path: the "county/district/school" fragment appended to the site's
#       /school/ URL.
# Returns a character vector with the text of every node matched by the CSS
# selector below. The calling loop reads element 1 as the status, element 2
# as the enrollment and element 3 as the Title I flag.
# Any error raised while fetching or parsing the page propagates to the
# caller.
get_enrolls_status_titlei <- function(path) {
  profile_selector <- ".profiledata:nth-child(7) span , .profiledata:nth-child(5) li+ li span"
  school_page <- read_html(paste0("http://www.ed-data.org/school/", path))
  matched_nodes <- html_nodes(school_page, profile_selector)
  as.character(html_text(matched_nodes))
}

# Add enrollment, Title I and status columns by scraping each school's page.
# The columns start out all-NA and are filled in one row at a time.
schools$enrolls <- rep(NA, nrow(schools))
schools$titlei <- rep(NA, nrow(schools))
schools$status <- rep(NA, nrow(schools))
# seq_len() instead of 1:nrow(): with an empty table 1:0 would still run the
# body for i = 1 and i = 0.
for (i in seq_len(nrow(schools))) {
  # A failed request is reported and mapped to NULL so that one bad page does
  # not abort the whole run.
  vect <- tryCatch(
    get_enrolls_status_titlei(schools$path[i]),
    error = function(e) {
      message("Could not scrape ", schools$path[i], ": ", conditionMessage(e))
      NULL
    }
  )
  # All three values must be present. With fewer matches the positions below
  # cannot be trusted, and a missing third element would otherwise be
  # recorded as titlei = FALSE.
  if (length(vect) >= 3) {
    schools$titlei[i] <- grepl("Yes", vect[3])
    schools$status[i] <- stringr::str_trim(vect[1])
    schools$enrolls[i] <- as.numeric(vect[2])
  } else {
    # Note: storing the string "Unknown" coerces the whole column to
    # character, so after the first failure titlei and enrolls hold text.
    schools$titlei[i] <- "Unknown"
    schools$status[i] <- "Unknown"
    schools$enrolls[i] <- "Unknown"
  }
}

# Write the augmented table to the working directory.
write_csv(schools, "school_data.csv")
	library(rvest)
	library(dplyr)
	library(stringr)
	library(readr)

	# NOTE(review): from the library() calls just above to the end of the file,
	# the script repeats the section above verbatim, so running the file reads,
	# scrapes and writes everything twice. Confirm whether the duplication is
	# intentional.
	#
	# Read the school list from the working directory and build, for every row,
	# the "county/district/school" path that is later appended to the site URL.
	schools <- read_csv("schools.csv") %>%
	  # Drops rows 2535-2539 by position.
	  # NOTE(review): the reason is not visible in this script -- presumably
	  # non-school rows in this particular export; confirm before reusing the
	  # script with a different schools.csv.
	  slice(-c(2535:2539)) %>%
	  mutate(
	    # Strip a trailing standalone word "School" from the account name. The
	    # pattern is anchored to the end of the string so that an earlier
	    # occurrence (e.g. in "Schoolcraft Elementary School") is left alone;
	    # names whose last word is not exactly "School" are returned unchanged.
	    # NOTE(review): the space in front of the removed word is kept, so the
	    # slug built below ends in "-"; confirm that is what the site expects.
	    school = stringr::str_replace(`Account Name`, "\\bSchool$", ""),
	    # Spaces become hyphens in every path component.
	    school = gsub(" ", "-", school),
	    county = gsub(" ", "-", County),
	    district = gsub(" ", "-", District),
	    path = paste(county, district, school, sep = "/")
	  )

	# Scrape the profile values for one school.
	#
	# path: the "county/district/school" fragment appended to the site's
	#       /school/ URL.
	# Returns a character vector with the text of every node matched by the CSS
	# selector below. The calling loop reads element 1 as the status, element 2
	# as the enrollment and element 3 as the Title I flag.
	# Any error raised while fetching or parsing the page propagates to the
	# caller.
	get_enrolls_status_titlei <- function(path) {
	  profile_selector <- ".profiledata:nth-child(7) span , .profiledata:nth-child(5) li+ li span"
	  school_page <- read_html(paste0("http://www.ed-data.org/school/", path))
	  matched_nodes <- html_nodes(school_page, profile_selector)
	  as.character(html_text(matched_nodes))
	}

	# Add enrollment, Title I and status columns by scraping each school's page.
	# The columns start out all-NA and are filled in one row at a time.
	schools$enrolls <- rep(NA, nrow(schools))
	schools$titlei <- rep(NA, nrow(schools))
	schools$status <- rep(NA, nrow(schools))
	# seq_len() instead of 1:nrow(): with an empty table 1:0 would still run the
	# body for i = 1 and i = 0.
	for (i in seq_len(nrow(schools))) {
	  # A failed request is reported and mapped to NULL so that one bad page does
	  # not abort the whole run.
	  vect <- tryCatch(
	    get_enrolls_status_titlei(schools$path[i]),
	    error = function(e) {
	      message("Could not scrape ", schools$path[i], ": ", conditionMessage(e))
	      NULL
	    }
	  )
	  # All three values must be present. With fewer matches the positions below
	  # cannot be trusted, and a missing third element would otherwise be
	  # recorded as titlei = FALSE.
	  if (length(vect) >= 3) {
	    schools$titlei[i] <- grepl("Yes", vect[3])
	    schools$status[i] <- stringr::str_trim(vect[1])
	    schools$enrolls[i] <- as.numeric(vect[2])
	  } else {
	    # Note: storing the string "Unknown" coerces the whole column to
	    # character, so after the first failure titlei and enrolls hold text.
	    schools$titlei[i] <- "Unknown"
	    schools$status[i] <- "Unknown"
	    schools$enrolls[i] <- "Unknown"
	  }
	}

	# Write the augmented table to the working directory.
	write_csv(schools, "school_data.csv")