Last active
August 21, 2021 14:27
-
-
Save dbrby/cab614288b998a5157fa6a87145969f2 to your computer and use it in GitHub Desktop.
Scrape all legislative speeches from Hansard
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load dependencies. Use library() (errors loudly if a package is missing)
# rather than require() (which only returns FALSE and lets the script limp on).
# The original loaded only purrr/tidyverse, yet the script also calls
# read_html()/html_elements() (rvest), pblapply() (pbapply) and
# replace_white() (textclean) — load those explicitly.
library(tidyverse)  # includes purrr, stringr, readr (parse_number), tibble
library(rvest)      # read_html(), html_elements(), html_attr(), html_text()
library(pbapply)    # pblapply() — lapply() with a progress bar
library(textclean)  # replace_white()
# Date of the 2010 UK general election — start of the scrape window.
elect_2010 <- as.Date("2010-05-08")

# Every calendar day from the 2010 election up to today. Hansard serves one
# page per sitting day; non-sitting days simply return an empty listing.
x <- seq.Date(elect_2010, Sys.Date(), "day")

# Base URL for the House of Commons section of Hansard; one day-listing
# page per date, e.g. https://hansard.parliament.uk/commons/2010-05-08
front_url <- "https://hansard.parliament.uk/commons"
day_urls <- paste0(front_url, "/", x)
day_htmls <- pblapply(day_urls, read_html) #Read in HTMLs for days | |
# From each day page, pull the href of every debate card (Hansard tags each
# debate link with the "card-section" class).
debate_links <- pblapply(day_htmls, function(day_page) {
  day_page %>%
    html_elements(".card-section") %>%
    html_attr("href")
})

# Flatten the per-day lists into one character vector and turn the relative
# hrefs into absolute debate URLs.
debate_links <- debate_links %>%
  unlist() %>%
  paste0("https://hansard.parliament.uk", .)
# Fetch every individual debate page (network-bound, progress bar shown).
debate_htmls <- pblapply(debate_links, read_html)

# Build one row per speech: speech text, speaker name, and MNIS member id.
out <- debate_htmls %>% map_df(~{
  # BUG FIX: the original body referenced `pg`, which was never defined
  # inside the lambda — bind it from the current page (.x).
  pg <- .x

  # MNIS member id: the numeric part of the member-profile href attached to
  # each attributed speech.
  mnis_id <- pg %>%
    html_elements("[class = 'attributed-to-details with-link']") %>%
    html_attr("href") %>%
    parse_number() %>%
    as.character()

  # Speaker names; drop placeholder entries such as "Several hon. Members".
  speakers <- pg %>%
    html_elements(".primary-text") %>%
    html_text() %>%
    replace_white() %>%
    trimws() %>%
    str_subset("Several", negate = TRUE)

  # The speech text bodies.
  text <- pg %>%
    html_elements(".content") %>%
    html_text() %>%
    replace_white() %>%
    trimws()

  # BUG FIX: the original wrote tibble(text, speaker, mnis_id) but the
  # variable above is `speakers` — `speaker` was undefined and crashed.
  # NOTE(review): assumes text/speakers/mnis_id align one-to-one per
  # speech; tibble() will error if their lengths differ — verify on a
  # sample page before a full run.
  tibble(text, speaker = speakers, mnis_id)
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment