# benmarwick/dsm-course-checking.R

## dsm-course-checking.R

library(tidyverse)
library(rvest)

# Term to check for DSM course availability: quarter followed by year
qrt_year <- "AUT2020"

# Address of the page holding our canonical list of DSM courses
webpage <- "https://www.washington.edu/uaa/advising/single-pages/data-science-minor/"

# scrape text from DSM list ---------------------------------------
# Download the page, select every list item inside a blockquote, and keep
# only the text of each item
dsm_page <- read_html(webpage)
dsm_list_items <- html_nodes(dsm_page, "blockquote li")
dsm_courses <- html_text(dsm_list_items)

# tidy DSM course list --------------------------------------------

# Course prefix: the first run of capital letters in each scraped entry
# (NA when an entry contains none)
dsm_courses_prefix <- str_extract(dsm_courses, "[A-Z]+")

# Course number: the first run of digits in each scraped entry
# (NA when an entry contains none)
dsm_courses_number <- str_extract(dsm_courses, "[0-9]+")

# search course catalog & time schedule ----------------------------

# Course catalog URL for each DSM course: base URL + lower-cased prefix + .html
crscat_url <- "https://www.washington.edu/students/crscat/"
dsm_crscat_url <- str_c(crscat_url, str_to_lower(dsm_courses_prefix), ".html")

# Base URL of the time schedule; a term's index page is this base + qrt_year
timeschd_url <- "https://www.washington.edu/students/timeschd/"

# the timeschd_url prefixes are not identical to the course prefixes :(
# Download and parse the term's index page once, then read both the link text
# and the link targets from the same set of nodes (previously the page was
# requested twice).
# NOTE(review): "ul:nth-child(34)" is positional; presumably it selects the
# list of programs the cas_* names refer to -- confirm against the live page.
timeschd_program_links <-
  timeschd_url %>%
  str_c(qrt_year) %>%
  read_html() %>%
  html_nodes("ul:nth-child(34) a")

cas_programs <- html_text(timeschd_program_links)
cas_program_urls <- html_attr(timeschd_program_links, "href")

# note that we may miss some things like JSIS A
# cas_prefix is the first run of 3-4 capital letters in the link text; rows
# where no such run exists are dropped. The relative hrefs are turned into
# absolute URLs under the term's time schedule directory.
cas_program_names_and_urls <-
  tibble(cas_programs = cas_programs,
         cas_prefix = str_extract(cas_programs, "[A-Z]{3,4}"),
         cas_program_urls = str_c(timeschd_url, qrt_year, "/",
                                  cas_program_urls)) %>%
  drop_na(cas_prefix)

# get time schedule classes for prefixes listed on the dsm website

# Read one time schedule page and return the text of its course links.
#
# x: URL (or anything else read_html() accepts) of the page to read.
# Returns a character vector holding the whitespace-squished text of every
# node matched by the "br+ table a:nth-child(1)" selector.
get_stuff <- function(x) {
  read_html(x) %>%
    html_nodes("br+ table a:nth-child(1)") %>%
    html_text() %>%
    str_squish()
}

# Error-tolerant wrapper: returns list(result, error) instead of stopping.
# On failure `result` is a character NA, the same type as a successful
# result, so downstream string functions see a consistent type.
get_stuff_safe <- safely(get_stuff, otherwise = NA_character_)

# Request every program page on the current time schedule (this takes a few
# seconds). Each element is the value returned by get_stuff_safe() for one URL.
courses_from_timeschd <-
  map(cas_program_names_and_urls$cas_program_urls, get_stuff_safe)

# Pull the "result" component out of every element and store it as a
# list-column next to the program it was fetched for
timeschd_results <- transpose(courses_from_timeschd)[["result"]]
cas_program_names_and_urls_offered <-
  mutate(cas_program_names_and_urls,
         courses_from_timeschd_result = timeschd_results)

# combine into data frame -----------------------------------------------

# One row per DSM course: prefix, number, course catalog URL, and an HTML
# anchor that links the "PREFIX NUMBER" label to the catalog page
dsm_course_links <-
  tibble(dsm_courses_prefix = dsm_courses_prefix,
         dsm_courses_number = dsm_courses_number,
         dsm_crscat_url = dsm_crscat_url,
         dsm_course_linked_text = str_c("<a href='", dsm_crscat_url, "'>",
                                        dsm_courses_prefix, " ",
                                        dsm_courses_number, "</a>"))

# Attach the time schedule courses scraped for the program whose prefix
# matches the DSM course prefix (unmatched DSM courses are kept)
dsm_courses_prefix_and_number <-
  left_join(dsm_course_links,
            cas_program_names_and_urls_offered,
            by = c("dsm_courses_prefix" = "cas_prefix"))

# check to see if the dsm course is offered in the term?

# Is `course_number` present among the course labels scraped for a program?
#
# listed_courses: character vector of scraped course labels for one program;
#   may be NULL (no program matched the prefix) or NA (the page fetch failed).
# course_number: a single course number as a string, possibly NA.
# Returns TRUE only when some label contains the number as a whole number
# (not as part of a longer run of digits); FALSE in every other case.
is_course_listed <- function(listed_courses, course_number) {
  if (length(listed_courses) == 0 || is.na(course_number)) {
    return(FALSE)
  }
  # lookarounds stop e.g. "30" from matching inside "301"
  whole_number <- str_c("(?<![0-9])", course_number, "(?![0-9])")
  any(str_detect(as.character(listed_courses), whole_number), na.rm = TRUE)
}

# Compare each DSM course number with the list of courses scraped for its
# prefix, row by row. The list-column is matched element-wise instead of
# being passed to str_detect() directly, which would coerce each list element
# to a single deparsed string.
dsm_courses_prefix_and_number_offered <-
  dsm_courses_prefix_and_number %>%
  mutate(course_offered_yes_no =
           map2_lgl(courses_from_timeschd_result,
                    dsm_courses_number,
                    is_course_listed))

# Show the DSM courses found on the time schedule, one row per course
dsm_courses_prefix_and_number_offered %>%
  filter(course_offered_yes_no) %>%
  distinct(dsm_course_linked_text, .keep_all = TRUE) %>%
  select(dsm_courses_prefix,
         dsm_courses_number,
         dsm_crscat_url,
         dsm_course_linked_text)

	library(tidyverse)
	library(rvest)

	# Term to check for DSM course availability: quarter followed by year
	qrt_year <- "AUT2020"

	# Address of the page holding our canonical list of DSM courses
	webpage <- "https://www.washington.edu/uaa/advising/single-pages/data-science-minor/"

	# scrape text from DSM list ---------------------------------------
	# Download the page, select every list item inside a blockquote, and keep
	# only the text of each item
	dsm_page <- read_html(webpage)
	dsm_list_items <- html_nodes(dsm_page, "blockquote li")
	dsm_courses <- html_text(dsm_list_items)

	# tidy DSM course list --------------------------------------------

	# Course prefix: the first run of capital letters in each scraped entry
	# (NA when an entry contains none)
	dsm_courses_prefix <- str_extract(dsm_courses, "[A-Z]+")

	# Course number: the first run of digits in each scraped entry
	# (NA when an entry contains none)
	dsm_courses_number <- str_extract(dsm_courses, "[0-9]+")

	# search course catalog & time schedule ----------------------------

	# Course catalog URL for each DSM course: base URL + lower-cased prefix + .html
	crscat_url <- "https://www.washington.edu/students/crscat/"
	dsm_crscat_url <- str_c(crscat_url, str_to_lower(dsm_courses_prefix), ".html")

	# Base URL of the time schedule; a term's index page is this base + qrt_year
	timeschd_url <- "https://www.washington.edu/students/timeschd/"

	# the timeschd_url prefixes are not identical to the course prefixes :(
	# Download and parse the term's index page once, then read both the link text
	# and the link targets from the same set of nodes (previously the page was
	# requested twice).
	# NOTE(review): "ul:nth-child(34)" is positional; presumably it selects the
	# list of programs the cas_* names refer to -- confirm against the live page.
	timeschd_program_links <-
	  timeschd_url %>%
	  str_c(qrt_year) %>%
	  read_html() %>%
	  html_nodes("ul:nth-child(34) a")

	cas_programs <- html_text(timeschd_program_links)
	cas_program_urls <- html_attr(timeschd_program_links, "href")

	# note that we may miss some things like JSIS A
	# cas_prefix is the first run of 3-4 capital letters in the link text; rows
	# where no such run exists are dropped. The relative hrefs are turned into
	# absolute URLs under the term's time schedule directory.
	cas_program_names_and_urls <-
	  tibble(cas_programs = cas_programs,
	         cas_prefix = str_extract(cas_programs, "[A-Z]{3,4}"),
	         cas_program_urls = str_c(timeschd_url, qrt_year, "/",
	                                  cas_program_urls)) %>%
	  drop_na(cas_prefix)

	# get time schedule classes for prefixes listed on the dsm website

	# Read one time schedule page and return the text of its course links.
	#
	# x: URL (or anything else read_html() accepts) of the page to read.
	# Returns a character vector holding the whitespace-squished text of every
	# node matched by the "br+ table a:nth-child(1)" selector.
	get_stuff <- function(x) {
	  read_html(x) %>%
	    html_nodes("br+ table a:nth-child(1)") %>%
	    html_text() %>%
	    str_squish()
	}

	# Error-tolerant wrapper: returns list(result, error) instead of stopping.
	# On failure `result` is a character NA, the same type as a successful
	# result, so downstream string functions see a consistent type.
	get_stuff_safe <- safely(get_stuff, otherwise = NA_character_)

	# Request every program page on the current time schedule (this takes a few
	# seconds). Each element is the value returned by get_stuff_safe() for one URL.
	courses_from_timeschd <-
	  map(cas_program_names_and_urls$cas_program_urls, get_stuff_safe)

	# Pull the "result" component out of every element and store it as a
	# list-column next to the program it was fetched for
	timeschd_results <- transpose(courses_from_timeschd)[["result"]]
	cas_program_names_and_urls_offered <-
	  mutate(cas_program_names_and_urls,
	         courses_from_timeschd_result = timeschd_results)

	# combine into data frame -----------------------------------------------

	# One row per DSM course: prefix, number, course catalog URL, and an HTML
	# anchor that links the "PREFIX NUMBER" label to the catalog page
	dsm_course_links <-
	  tibble(dsm_courses_prefix = dsm_courses_prefix,
	         dsm_courses_number = dsm_courses_number,
	         dsm_crscat_url = dsm_crscat_url,
	         dsm_course_linked_text = str_c("<a href='", dsm_crscat_url, "'>",
	                                        dsm_courses_prefix, " ",
	                                        dsm_courses_number, "</a>"))

	# Attach the time schedule courses scraped for the program whose prefix
	# matches the DSM course prefix (unmatched DSM courses are kept)
	dsm_courses_prefix_and_number <-
	  left_join(dsm_course_links,
	            cas_program_names_and_urls_offered,
	            by = c("dsm_courses_prefix" = "cas_prefix"))

	# check to see if the dsm course is offered in the term?

	# Is `course_number` present among the course labels scraped for a program?
	#
	# listed_courses: character vector of scraped course labels for one program;
	#   may be NULL (no program matched the prefix) or NA (the page fetch failed).
	# course_number: a single course number as a string, possibly NA.
	# Returns TRUE only when some label contains the number as a whole number
	# (not as part of a longer run of digits); FALSE in every other case.
	is_course_listed <- function(listed_courses, course_number) {
	  if (length(listed_courses) == 0 || is.na(course_number)) {
	    return(FALSE)
	  }
	  # lookarounds stop e.g. "30" from matching inside "301"
	  whole_number <- str_c("(?<![0-9])", course_number, "(?![0-9])")
	  any(str_detect(as.character(listed_courses), whole_number), na.rm = TRUE)
	}

	# Compare each DSM course number with the list of courses scraped for its
	# prefix, row by row. The list-column is matched element-wise instead of
	# being passed to str_detect() directly, which would coerce each list element
	# to a single deparsed string.
	dsm_courses_prefix_and_number_offered <-
	  dsm_courses_prefix_and_number %>%
	  mutate(course_offered_yes_no =
	           map2_lgl(courses_from_timeschd_result,
	                    dsm_courses_number,
	                    is_course_listed))

	# Show the DSM courses found on the time schedule, one row per course
	dsm_courses_prefix_and_number_offered %>%
	  filter(course_offered_yes_no) %>%
	  distinct(dsm_course_linked_text, .keep_all = TRUE) %>%
	  select(dsm_courses_prefix,
	         dsm_courses_number,
	         dsm_crscat_url,
	         dsm_course_linked_text)