Skip to content

Instantly share code, notes, and snippets.

@zkamvar
Created November 8, 2022 15:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zkamvar/bbca87e6053fbb43449cf7f7cb8f3e5f to your computer and use it in GitHub Desktop.
Save zkamvar/bbca87e6053fbb43449cf7f7cb8f3e5f to your computer and use it in GitHub Desktop.
Scrape lessons from workshops and determine if they have workbench lessons
library("jsonlite")
library("polite")
library("rvest")
library("purrr")
library("dplyr")
# Lesson slugs (first path component of the lesson URL, lowercase) that are
# treated as workbench lessons when filtering the scraped schedule links.
workbench_slugs <- c(
  "r-ecology-lesson",
  "r-socialsci",
  "r-raster-vector-geospatial",
  "lc-shell",
  "instructor-training",
  "python-ecology-es"
)
#' Extract a data frame of lessons from a workshop website
#'
#' Fetches the workshop page with {polite} and collects the links found in the
#' element that immediately follows the element with id `schedule`. Three page
#' layouts are tried in order (a table, an `h3` heading, a paragraph) and the
#' first one that yields any links is used.
#'
#' A page that cannot be retrieved does not abort the caller: a warning is
#' raised for connection errors and an empty data frame is returned, so a
#' single broken workshop site does not stop a scrape over many workshops.
#'
#' @param url the URL to a workshop website
#' @return a data frame with two columns
#'   - url: the url of a lesson or resource
#'   - name: the name of the lesson or resource
#'
#'   The data frame has zero rows when the page could not be retrieved or
#'   when no schedule links were found.
get_schedule <- function(url) {
  # open the session and get the HTML; a failed connection becomes a warning
  butter <- tryCatch(
    polite::scrape(polite::bow(url)),
    error = function(e) {
      warning(
        "Could not scrape ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  # scrape() can also hand back NULL on its own (e.g. scraping disallowed by
  # robots.txt or an HTTP error status); there is nothing to parse then
  if (is.null(butter)) {
    return(data.frame(url = character(0), name = character(0)))
  }
  # candidate layouts for the schedule, in order of preference
  selectors <- c(
    "#schedule + table a", # links inside a table right after #schedule
    "#schedule + h3 a",    # links inside an h3 heading right after #schedule
    "#schedule + p a"      # links inside a paragraph right after #schedule
  )
  for (selector in selectors) {
    sched <- rvest::html_nodes(butter, selector)
    if (length(sched) > 0) {
      break
    }
  }
  # return a data frame with the URL and name of the lesson
  data.frame(
    url = rvest::html_attr(sched, "href"),
    name = rvest::html_text(sched)
  )
}
#' Extract the slug from a URL
#'
#' The slug is the first component of the URL path, e.g. `r-socialsci` for
#' `https://datacarpentry.org/R-socialsci/index.html`.
#'
#' @param url a character vector of URLs
#' @return a character vector the same length as `url` with the URL slug in
#'   lowercase format, or `NA` for URLs whose path has no first component
#'   (for example a bare site root such as `https://carpentries.org/`)
get_url_slug <- function(url) {
  xml2::url_parse(url)$path |>
    tolower() |>
    # a path like "/slug/page" splits into c("", "slug", "page"), so the slug
    # is the second piece
    strsplit("/") |>
    # paths with fewer than two pieces have no slug; give NA instead of
    # erroring so one root-level link cannot break a whole column
    purrr::map_chr(2, .default = NA_character_)
}
# Download the feed of upcoming workshops (one list entry per workshop).
upcoming <- read_json("https://feeds.carpentries.org/all_upcoming_workshops.json")
# Workshop website URLs keyed by workshop slug; the workshop records
# themselves get the same keys.
urls <- setNames(map_chr(upcoming, "url"), map_chr(upcoming, "slug"))
names(upcoming) <- names(urls)
# Build the link table: one row per schedule link, with the originating
# workshop recorded in the `slug` column.
lessons <- map_dfr(urls, get_schedule, .id = "slug")
# Keep only the links that point at a lesson using the workbench.
res <- lessons |>
  dplyr::mutate(path = get_url_slug(url)) |>
  dplyr::filter(path %in% workbench_slugs)
# TODO: extract names and find in AMY
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment