Skip to content

Instantly share code, notes, and snippets.

@njtierney
Created October 19, 2023 16:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save njtierney/d259996174fcffe992592414f80571ed to your computer and use it in GitHub Desktop.
Save njtierney/d259996174fcffe992592414f80571ed to your computer and use it in GitHub Desktop.
library(polite)
library(tidyverse)
library(httr2)
library(rvest)
# Example post URL; passed to extract_pct_summary() further down as a quick check.
url <- "https://njt.micro.blog/2023/08/19/pct-day-kennedy.html"
#' Extract the summary list from a single post
#'
#' Fetches `url` with polite's `bow()`/`scrape()`, takes the first `<ul>`
#' element on the page, and turns its "label: value" lines into a one-row
#' tibble with one column per label.
#'
#' @param url URL of the post to scrape.
#' @return A one-row tibble of character columns. Column names are the
#'   labels cleaned by `janitor::make_clean_names()`; values are the text
#'   after the first ":" on each line, with surrounding whitespace removed.
extract_pct_summary <- function(url) {
  raw <- bow(url) %>% scrape()

  lists <- html_elements(raw, "ul")
  # Fail here with a clear message instead of further down the pipeline
  # when the page has no list at all.
  if (length(lists) == 0) {
    stop("No <ul> element found at ", url, call. = FALSE)
  }

  lists[[1]] %>%
    html_text2() %>%
    str_split_1("\n") %>%
    # as_tibble() on an atomic vector is superseded; this builds the same
    # single-column tibble explicitly.
    tibble::as_tibble_col(column_name = "value") %>%
    tidyr::separate_wider_delim(
      cols = value,
      names = c("variable", "value"),
      delim = ":",
      # Split on the first ":" only, so a value that itself contains a
      # colon (e.g. a time) is kept whole instead of raising an error.
      too_many = "merge"
    ) %>%
    # Drop the space that follows the ":" separator.
    mutate(value = str_trim(value)) %>%
    pivot_wider(
      names_from = "variable",
      values_from = "value",
      names_repair = janitor::make_clean_names
    )
}
# Try the extractor on the example post; the result prints at top level.
url %>% extract_pct_summary()
#' Collect post links from the blog archive page
#'
#' Fetches the archive page with polite's `bow()`/`scrape()` and returns the
#' `href` of every `a.u-url` link found inside a `p.h-entry` element.
#'
#' @param url_archive URL of the archive page. Defaults to the archive this
#'   script was written for, so existing `archive_links()` calls are
#'   unaffected.
#' @return A character vector of `href` values in the reverse of their order
#'   on the page (presumably oldest post first -- confirm against the site).
archive_links <- function(url_archive = "https://njt.micro.blog/archive/") {
  raw_archive <- bow(url_archive) %>% scrape()

  raw_archive %>%
    html_elements("p.h-entry") %>%
    html_elements("a.u-url") %>%
    html_attr("href") %>%
    rev()
}
# Collect every post link from the archive and print them for inspection.
urls <- archive_links()
urls
# The first 7 entries don't use the formatting that I
# wanted, so I'll skip them.
# head()/tail() replace urls[1:7] and urls[8:length(urls)]: with fewer than
# 8 links those index forms pad with NA or count backwards, whereas these
# simply return shorter (possibly empty) vectors.
n_skip <- 7
urls_old <- head(urls, n_skip)
urls_new <- tail(urls, -n_skip)
# Spot-check the extractor on the first kept post before running it on all.
urls_new[1] %>% extract_pct_summary()
# One summary tibble per remaining post.
urls_extracted <- map(urls_new, extract_pct_summary)
urls_extracted
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment