Scrape all legislative speeches from Hansard
library(tidyverse) # Loads purrr (map_df), readr (parse_number) and stringr (str_subset)
library(rvest)     # read_html(), html_elements(), html_attr(), html_text()
library(pbapply)   # pblapply(): lapply() with a progress bar
library(textclean) # replace_white()
elect_2010 <- as.Date("2010-05-08") # Start of the window: just after the 2010 general election (held 6 May 2010)
x <- seq.Date(elect_2010, Sys.Date(), "day") # Every day from then until today
front_url <- "https://hansard.parliament.uk/commons" # Base URL for the Commons section of Hansard
day_urls <- paste0(front_url, "/", x) # One URL per day
day_htmls <- pblapply(day_urls, read_html) # Read in the HTML for each day
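# Not in the original gist: a hedged sketch of a fault-tolerant download,
# assuming purrr::possibly(). With several thousand URLs a single timeout
# would abort pblapply(), so failed days come back as NULL and are dropped:
# safe_read <- possibly(read_html, otherwise = NULL)
# day_htmls <- pblapply(day_urls, safe_read)
# day_htmls <- compact(day_htmls) # purrr::compact() drops the NULL failures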
debate_links <- pblapply(day_htmls, function(lg) {
  lg %>%
    html_elements(".card-section") %>%
    html_attr("href")
}) # For each day page, pull the relative href of every debate card
debate_links <- debate_links %>% unlist() # Flatten to a single character vector
debate_links <- paste0("https://hansard.parliament.uk", debate_links) # Make the relative links absolute
debate_htmls <- pblapply(debate_links, read_html) # Read in the HTML for each debate
out <- debate_htmls %>% map_df(~{
  mnis_id <- .x %>% html_elements("[class = 'attributed-to-details with-link']") %>%
    html_attr("href") %>% parse_number() %>% as.character() # MNIS member ID, parsed from the contribution link
  speaker <- .x %>% html_elements(".primary-text") %>%
    html_text() %>% replace_white() %>% trimws() %>%
    str_subset("Several", negate = TRUE) # Drop "Several hon. Members" pseudo-speakers
  text <- .x %>% html_elements(".content") %>%
    html_text() %>% replace_white() %>% trimws() # Speech text, whitespace-normalised
  tibble(text, speaker, mnis_id) # Note: the three vectors must be equal length or tibble() errors
})
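# Not in the original gist: a minimal sketch for inspecting and persisting the
# result, assuming a CSV is wanted; the filename is hypothetical. write_csv()
# ships with the tidyverse (readr):
# glimpse(out) # one row per contribution: text, speaker, mnis_id
# write_csv(out, "hansard_speeches.csv")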