# zkamvar/scrape-workshops.R

## scrape-workshops.R
library("jsonlite")
library("polite")
library("rvest")
library("purrr")
library("dplyr")


# Lesson slugs to look for among the links scraped from workshop schedules.
# They are compared (with %in%) against the lowercased path component that
# get_url_slug() extracts from each link.
# NOTE(review): presumably these are the lessons that use the workbench --
# confirm the list is current.
workbench_slugs <- c(
  "r-ecology-lesson",
  "r-socialsci",
  "r-raster-vector-geospatial",
  "lc-shell",
  "instructor-training",
  "python-ecology-es"
)

#' Extract a data frame of lessons from a workshop website
#'
#' Collects the links found in the element that immediately follows the
#' `#schedule` element, trying a table first, then an `h3` heading, then a
#' paragraph.
#'
#' @param url the URL to a workshop website
#' @return a data frame with two columns
#'   - url: the url of a lesson or resource
#'   - name: the name of the lesson or resource
#'   The data frame has zero rows when the page could not be scraped or when
#'   no schedule links were found.
get_schedule <- function(url) {
  # open the session
  session <- polite::bow(url)
  # get the HTML
  page <- polite::scrape(session)
  # polite::scrape() gives NULL when scraping is disallowed or the request
  # fails; return an empty table so one bad site does not abort the whole run
  if (is.null(page)) {
    return(data.frame(url = character(0), name = character(0)))
  }
  # candidate locations of the schedule links, in order of preference
  selectors <- c(
    "#schedule + table a", # all links in a table
    "#schedule + h3 a",    # all links in a level-3 heading
    "#schedule + p a"      # all links in a paragraph
  )
  for (css in selectors) {
    links <- rvest::html_nodes(page, css)
    if (length(links) > 0) {
      break
    }
  }
  # return a data frame with the URL and name of the lesson
  data.frame(
    url = rvest::html_attr(links, "href"),
    name = rvest::html_text(links)
  )
}

#' Extract the slug from a URL
#'
#' The slug is the first component of the URL path, e.g. `r-ecology-lesson`
#' for `https://datacarpentry.org/R-ecology-lesson/`.
#'
#' @param url a character vector of URLs
#' @return a character vector the same length as `url` holding the URL slug
#'   in lowercase format, or `NA` where the path has no such component (for
#'   example a bare domain)
get_url_slug <- function(url) {
  xml2::url_parse(url)$path |>
    tolower() |>
    strsplit("/") |>
    # piece 1 is the empty string before the leading "/", so the slug is
    # piece 2; yield NA instead of erroring when that piece is absent
    purrr::map_chr(2, .default = NA_character_)
}

# download the feed of upcoming workshops and key each entry by its slug
upcoming <- read_json("https://feeds.carpentries.org/all_upcoming_workshops.json")
names(upcoming) <- map_chr(upcoming, "slug")
# map_chr() keeps the names of its input, so `urls` is keyed by slug as well
urls <- map_chr(upcoming, "url")

# scrape every workshop site and stack the link tables, recording the
# workshop each row came from in a `slug` column
lessons <- bind_rows(map(urls, get_schedule), .id = "slug")

# keep only the links whose URL slug is one of the workbench lessons
res <- filter(
  mutate(lessons, path = get_url_slug(url)),
  path %in% workbench_slugs
)

# TODO: extract names and find in AMY
	library("jsonlite")
	library("polite")
	library("rvest")
	library("purrr")
	library("dplyr")


	# Lesson slugs to look for among the links scraped from workshop schedules.
	# They are compared (with %in%) against the lowercased path component that
	# get_url_slug() extracts from each link.
	# NOTE(review): presumably these are the lessons that use the workbench --
	# confirm the list is current.
	workbench_slugs <- c(
	"r-ecology-lesson",
	"r-socialsci",
	"r-raster-vector-geospatial",
	"lc-shell",
	"instructor-training",
	"python-ecology-es"
	)

	#' Extract a data frame of lessons from a workshop website
	#'
	#' Collects the links found in the element that immediately follows the
	#' `#schedule` element, trying a table first, then an `h3` heading, then a
	#' paragraph.
	#'
	#' @param url the URL to a workshop website
	#' @return a data frame with two columns
	#'   - url: the url of a lesson or resource
	#'   - name: the name of the lesson or resource
	#'   The data frame has zero rows when the page could not be scraped or when
	#'   no schedule links were found.
	get_schedule <- function(url) {
	  # open the session
	  session <- polite::bow(url)
	  # get the HTML
	  page <- polite::scrape(session)
	  # polite::scrape() gives NULL when scraping is disallowed or the request
	  # fails; return an empty table so one bad site does not abort the whole run
	  if (is.null(page)) {
	    return(data.frame(url = character(0), name = character(0)))
	  }
	  # candidate locations of the schedule links, in order of preference
	  selectors <- c(
	    "#schedule + table a", # all links in a table
	    "#schedule + h3 a",    # all links in a level-3 heading
	    "#schedule + p a"      # all links in a paragraph
	  )
	  for (css in selectors) {
	    links <- rvest::html_nodes(page, css)
	    if (length(links) > 0) {
	      break
	    }
	  }
	  # return a data frame with the URL and name of the lesson
	  data.frame(
	    url = rvest::html_attr(links, "href"),
	    name = rvest::html_text(links)
	  )
	}

	#' Extract the slug from a URL
	#'
	#' The slug is the first component of the URL path, e.g. `r-ecology-lesson`
	#' for `https://datacarpentry.org/R-ecology-lesson/`.
	#'
	#' @param url a character vector of URLs
	#' @return a character vector the same length as `url` holding the URL slug
	#'   in lowercase format, or `NA` where the path has no such component (for
	#'   example a bare domain)
	get_url_slug <- function(url) {
	  xml2::url_parse(url)$path |>
	    tolower() |>
	    strsplit("/") |>
	    # piece 1 is the empty string before the leading "/", so the slug is
	    # piece 2; yield NA instead of erroring when that piece is absent
	    purrr::map_chr(2, .default = NA_character_)
	}

	# get the upcoming workshops and key both the feed entries and their URLs
	# by workshop slug
	upcoming <- read_json("https://feeds.carpentries.org/all_upcoming_workshops.json")
	urls <- map_chr(upcoming, "url")
	names(urls) <- map_chr(upcoming, "slug")
	names(upcoming) <- names(urls)

	# create the link table; the `slug` column records which workshop each
	# link was scraped from
	lessons <- map_dfr(urls, get_schedule, .id = "slug")

	# find out if we have any lessons that are upcoming that use the workbench
	res <- lessons |>
	  mutate(path = get_url_slug(url)) |>
	  filter(path %in% workbench_slugs)

	# TODO: extract names and find in AMY