Skip to content

Instantly share code, notes, and snippets.

@erikgregorywebb
Created January 6, 2021 23:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erikgregorywebb/557b798f864384d3c532e02d8c1c5939 to your computer and use it in GitHub Desktop.
Save erikgregorywebb/557b798f864384d3c532e02d8c1c5939 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(rvest)
library(tools)
library(fuzzyjoin)
# Extract the per-state program listing links (source: mastersindatascience.org)
schools_index_url <- 'https://www.mastersindatascience.org/schools/'
states <- read_html(schools_index_url) %>%
  html_nodes('.row') %>%
  html_nodes('a') %>%
  html_attr('href')
# Loop over state links to extract each school and its listed programs.
# Results accumulate as one tibble per program in `datalist`, combined at the end.
datalist <- list()
n <- 1
for (i in seq_along(states)) {
  Sys.sleep(1)  # throttle requests to be polite to the server
  page <- read_html(states[i])
  schools <- page %>% html_nodes('.schoolinfo')
  for (j in seq_along(schools)) {
    school_name <- schools[j] %>% html_node('.schoolname') %>% html_text()
    school_location <- schools[j] %>% html_node('.citystate') %>% html_text()
    # html_nodes (plural): a school can list several programs; the original
    # html_node() returned only a single node, dropping all but one program
    programs <- schools[j] %>% html_nodes('.schoolprogram')
    for (k in seq_along(programs)) {
      program_name <- programs[k] %>% html_node('h4') %>% html_text()
      program_url <- programs[k] %>% html_node('h4') %>% html_node('a') %>% html_attr('href')
      # progress output; `sep` belongs to paste() — the original passed it to
      # print(), where it was silently ignored
      print(paste(states[i], school_name, school_location, program_name, program_url, sep = ' | '))
      datalist[[n]] <- tibble(state_url = states[i], school_name, school_location, program_name, program_url)
      n <- n + 1
    }
  }
}
# bind_rows() combines the list of tibbles far more efficiently than
# do.call(rbind, ...) and keeps tibble column types intact
raw <- bind_rows(datalist)
# Clean: result is the list of US data science / analytics grad-level programs.
# NOTE: the tibble built inside the scraping loop already names its first
# column `state_url`, so the original rename(state_url = `states[i]`) referenced
# a nonexistent column and errored at runtime — it is dropped here.
ds_programs <- raw %>%
  # derive a human-readable state name from the trailing path segment of the URL
  mutate(state = toTitleCase(basename(state_url))) %>%
  glimpse() %>%
  select(state, school_name, school_location, program_name, program_url)
# Scrape the list of Historically Black Colleges and Universities
# (source: www.thehundred-seven.org/hbculist.html)
hbcu_url <- 'http://www.thehundred-seven.org/hbculist.html'
hbcu_names <- read_html(hbcu_url) %>%
  html_nodes('p:nth-child(4)') %>%
  html_nodes('a') %>%
  html_text() %>%
  trimws()
hbcus <- tibble(school_name = hbcu_names)
#hbcus %>% mutate(school_name = gsub("\\s*\\([^\\)]+\\)","",as.character(school_name)))
# Fuzzy join: flag data science programs hosted at HBCUs. max_dist = 1 allows
# one character of edit distance between school names when matching.
ds_programs_join <- fuzzyjoin::stringdist_left_join(
  x = ds_programs, y = hbcus, by = 'school_name', max_dist = 1
)
# Keep only rows that matched an HBCU; !is.na() replaces the original
# `is.na(...) == F`, which relied on the reassignable shorthand `F`
ds_programs_join %>% filter(!is.na(school_name.y))
@erikgregorywebb
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment