Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Created July 21, 2020 00:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save benmarwick/9d827978712f3e80250e608380e2d21b to your computer and use it in GitHub Desktop.
Save benmarwick/9d827978712f3e80250e608380e2d21b to your computer and use it in GitHub Desktop.
library(tidyverse)
library(rvest)
# Term to look up: a quarter code followed by the year. This value is
# appended to the time schedule URL further down.
qrt_year <- "AUT2020"
# Page holding the canonical list of DSM courses.
webpage <- "https://www.washington.edu/uaa/advising/single-pages/data-science-minor/"
# scrape text from DSM list ---------------------------------------
# Download the page and keep the text of every list item that sits inside a
# blockquote; each element of `dsm_courses` is the text of one list item.
dsm_courses <- html_text(
  html_nodes(
    read_html(webpage),
    "blockquote li"
  )
)
# tidy DSM course list --------------------------------------------
# Course prefix: the first run of capital letters in each list item
# (NA when an item contains none).
dsm_courses_prefix <- str_extract(dsm_courses, "[A-Z]+")
# Course number: the first run of digits in each list item
# (NA when an item contains none).
dsm_courses_number <- str_extract(dsm_courses, "[0-9]+")
# search course catalog & time schedule ----------------------------
# Course catalog pages are named after the lower-cased course prefix, so the
# catalog URL for each DSM course is: base URL + lower-case prefix + ".html".
crscat_url <- "https://www.washington.edu/students/crscat/"
dsm_crscat_url <-
  dsm_courses_prefix %>%
  str_to_lower() %>%
  str_c(crscat_url, ., ".html")
# get URL to time schedule
timeschd_url <- "https://www.washington.edu/students/timeschd/"
# the timeschd_url prefixes are not identical to the course prefixes :(
# Download and parse the quarter's index page ONCE and select the program
# links once, so that the link texts and the hrefs are guaranteed to come from
# the same document and stay aligned element by element.
# NOTE(review): "ul:nth-child(34)" is a positional selector; it will silently
# match nothing (or the wrong list) if the page layout changes.
cas_program_links <-
  timeschd_url %>%
  str_c(qrt_year) %>%
  read_html() %>%
  html_nodes("ul:nth-child(34) a")
if (length(cas_program_links) == 0) {
  warning(
    "No program links found on the time schedule index for ", qrt_year,
    "; check the 'ul:nth-child(34) a' selector.",
    call. = FALSE
  )
}
# Visible text of each program link.
cas_programs <- html_text(cas_program_links)
# Relative href of each program link (made absolute in the tibble below).
cas_program_urls <- html_attr(cas_program_links, "href")
# note that we may miss some things like JSIS A
# One row per program link: its text, the first run of 3-4 capital letters in
# that text (used later as the join key against the DSM course prefixes), and
# the absolute URL of the program's time schedule page. Links whose text has
# no such run of capitals are dropped.
cas_program_names_and_urls <-
  tibble(
    cas_programs = cas_programs,
    cas_prefix = str_extract(cas_programs, "[A-Z]{3,4}"),
    cas_program_urls = str_c(timeschd_url, qrt_year, "/", cas_program_urls)
  ) %>%
  drop_na(cas_prefix)
# get time schedule classes for prefixes listed on the dsm website
#
# Reads the page at `x` (a URL or anything else read_html() accepts), selects
# the nodes matching "br+ table a:nth-child(1)", and returns their text as a
# character vector with surrounding and repeated whitespace collapsed.
get_stuff <- function(x){
  page <- read_html(x)
  link_nodes <- html_nodes(page, "br+ table a:nth-child(1)")
  str_squish(html_text(link_nodes))
}
# Error-tolerant wrapper around get_stuff(): a page that fails to download or
# parse yields a missing value instead of aborting the whole run. The fallback
# is a character NA so it has the same type as a successful result (get_stuff()
# returns a character vector).
get_stuff_safe <- safely(get_stuff, otherwise = NA_character_)
# this will take a few seconds to get all the courses on the current time schedule
# One element per program URL; each element is a list with `result` and
# `error` components as produced by safely().
courses_from_timeschd <-
  map(cas_program_names_and_urls$cas_program_urls, get_stuff_safe)
# Attach the scraped schedule to each program row as a list column: flip the
# per-URL list of (result, error) pairs and keep only the `result` parts.
cas_program_names_and_urls_offered <- mutate(
  cas_program_names_and_urls,
  courses_from_timeschd_result =
    pluck(transpose(courses_from_timeschd), "result")
)
# combine into data frame -----------------------------------------------
# One row per DSM course: prefix, number, course catalog URL, and an HTML
# anchor ("PREFIX NUMBER" linking to the catalog page). The program table is
# then joined on by prefix, which brings in the scraped time schedule for the
# matching program(s); DSM prefixes with no matching program keep their row
# with missing program columns.
dsm_courses_prefix_and_number <-
  tibble(
    dsm_courses_prefix = dsm_courses_prefix,
    dsm_courses_number = dsm_courses_number,
    dsm_crscat_url = dsm_crscat_url,
    dsm_course_linked_text = str_c(
      "<a href='", dsm_crscat_url, "'>",
      dsm_courses_prefix, " ",
      dsm_courses_number, "</a>"
    )
  ) %>%
  left_join(
    cas_program_names_and_urls_offered,
    by = c("dsm_courses_prefix" = "cas_prefix")
  )
# check to see if the dsm course is offered in the term?
#
# Helper: is `course_number` present in one program's scraped schedule?
#   listed_courses  the link texts scraped from one time schedule page (one
#                   element of the `courses_from_timeschd_result` list column);
#                   may be NULL (no program matched the prefix) or NA (the
#                   page could not be read).
#   course_number   a single course number as a string of digits.
# Returns TRUE/FALSE, or NA when there is no schedule or no number to check.
# The number must not be preceded or followed by another digit, so "142" does
# not match inside "1142" or "11420".
course_number_listed <- function(listed_courses, course_number) {
  listed_courses <- as.character(unlist(listed_courses))
  if (length(listed_courses) == 0 ||
      all(is.na(listed_courses)) ||
      is.na(course_number)) {
    return(NA)
  }
  number_pattern <- str_c("(?<![0-9])", course_number, "(?![0-9])")
  any(str_detect(listed_courses, number_pattern), na.rm = TRUE)
}
# Each row's schedule is a whole vector of link texts held in a list column,
# so it is checked row by row (schedule vector paired with that row's course
# number) rather than handing the list column to str_detect() directly.
dsm_courses_prefix_and_number_offered <-
  dsm_courses_prefix_and_number %>%
  mutate(
    course_offered_yes_no = map2_lgl(
      courses_from_timeschd_result,
      dsm_courses_number,
      course_number_listed
    )
  )
# Final table: DSM courses found on the term's time schedule, one row per
# course (the prefix join can yield several rows for the same course).
# filter() drops rows where the flag is FALSE or NA.
dsm_courses_prefix_and_number_offered %>%
  filter(course_offered_yes_no) %>%
  distinct(dsm_course_linked_text, .keep_all = TRUE) %>%
  select(
    dsm_courses_prefix,
    dsm_courses_number,
    dsm_crscat_url,
    dsm_course_linked_text
  )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment