Skip to content

Instantly share code, notes, and snippets.

@dbrby
Last active August 2, 2021 16:38
Show Gist options
  • Save dbrby/88e78ddfa9a058b8eefee4eb9ba63b3a to your computer and use it in GitHub Desktop.
Save dbrby/88e78ddfa9a058b8eefee4eb9ba63b3a to your computer and use it in GitHub Desktop.
Build dataset from legislative speech records from the Fifth Senedd
require(rvest)
require(lubridate)
require(tidyverse)
require(readr)
require(data.table)
require(textclean)
require(XML)
require(lubridate)
require(pbapply)
# Results pages are requested one date window at a time, e.g.
# https://record.senedd.wales/XMLExport/?start=2016-05-05&end=2017-06-05
front <- "https://record.senedd.wales/XMLExport/?start="
mid <- "&end="
# Window start dates: monthly steps from 5 May 2016 up to the current date.
dates <- seq(as.Date("2016-05-05"), Sys.Date(), by = "month")
# One URL per window; each window ends one month after its start date.
links <- paste0(front, format(dates), mid, format(dates + months(1)))
# Fetch every results page. read_html() raises on a failed request, so a
# single unreachable page stops the whole run.
results_html <- pblapply(links, read_html)
# Collect the href of every anchor found on each results page.
links_xml_recs <- pblapply(results_html, function(page) {
  page %>%
    html_elements("a") %>%
    html_attr("href")
})
# Keep only the download links. Consecutive windows share a boundary date
# (each window ends on the day the next one starts), so the same record may
# be listed on two pages; unique() keeps each link once so no transcript is
# downloaded and bound twice.
# NOTE(review): "Download?" is interpreted as a regex, so the "?" makes the
# final "d" optional instead of matching a literal question mark -- confirm
# whether a literal match (stringr::fixed) was intended.
links_xml_recs <- links_xml_recs %>%
  unlist() %>%
  str_subset(pattern = "Download?") %>%
  unique()
# Split the relative links by transcript language and make them absolute.
eng_lang <- links_xml_recs %>%
  str_subset(pattern = "EnglishTranscript") %>%
  paste0("https://record.senedd.wales", .)
welsh_lang <- links_xml_recs %>%
  str_subset(pattern = "WelshTranscript") %>%
  paste0("https://record.senedd.wales", .)
# Root folder for the raw XML transcripts (machine-specific absolute path).
path <- "/Users/danbraby/Dropbox/parlCymru/raw_data/"
# download.file() does not create missing folders, so make sure both
# language sub-folders exist before writing into them.
dir.create(paste0(path, "english"), recursive = TRUE, showWarnings = FALSE)
dir.create(paste0(path, "welsh"), recursive = TRUE, showWarnings = FALSE)
# Files are named by their position in the link vector (1.xml, 2.xml, ...).
# seq_along() yields an empty loop when no links were found, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(eng_lang)) {
  download.file(
    url = eng_lang[i],
    destfile = paste0(path, "english/", i, ".xml")
  )
}
for (i in seq_along(welsh_lang)) {
  download.file(
    url = welsh_lang[i],
    destfile = paste0(path, "welsh/", i, ".xml")
  )
}
# Full paths of every file in each language folder. list.files() returns
# them in alphabetical order, so "10.xml" sorts before "2.xml".
# NOTE(review): later steps pair English and Welsh rows by position, which
# presumably relies on both folders holding the same file names -- confirm.
eng_files <- list.files(paste0(path, "english/"), full.names = TRUE)
welsh_files <- list.files(paste0(path, "welsh/"), full.names = TRUE)
# Parse each XML file and convert the document into a nested R list.
eng_xml <- pblapply(eng_files, function(xml_path) {
  xmlToList(xmlParse(xml_path))
})
welsh_xml <- pblapply(welsh_files, function(xml_path) {
  xmlToList(xmlParse(xml_path))
})
# Turn each parsed transcript into one table, then stack all transcripts.
# The last element of every parsed document is dropped before binding:
# head(x, -1) states this directly, whereas the previous index
# `1:length(x) - 1` evaluates to 0:(n - 1) and only worked because R
# ignores index 0.
# NOTE(review): the dropped element is presumably document-level attributes
# added by xmlToList() rather than a contribution -- confirm.
# lapply() also copes with an empty input list, unlike a 1:length() loop.
eng_data <- lapply(eng_xml, function(record) {
  rbindlist(head(record, -1), fill = TRUE)
})
welsh_data <- lapply(welsh_xml, function(record) {
  rbindlist(head(record, -1), fill = TRUE)
})
# fill = TRUE above pads fields missing from individual contributions with
# NA; bind_rows() then stacks the per-transcript tables into one.
eng_data <- bind_rows(eng_data)
welsh_data <- bind_rows(welsh_data)
# Clean the contribution text with replace_html() and trim surrounding
# whitespace, for both languages.
eng_data$Contribution <- trimws(replace_html(eng_data$Contribution_English))
welsh_data$Contribution <- trimws(replace_html(welsh_data$Contribution_Welsh))
# Start from the English table and attach the cleaned Welsh text by position.
# NOTE(review): this assumes row i of welsh_data is the same contribution as
# row i of eng_data; nothing here checks that -- confirm.
parlCymru <- eng_data
parlCymru$Contribution_Welsh <- welsh_data$Contribution
# is_speech is 1 when a member name is present and 0 when it is missing.
no_member_name <- is.na(parlCymru$Member_name_English)
parlCymru$is_speech <- 1
parlCymru$is_speech[no_member_name] <- 0
# Final tidy dataset: select and rename the fields of interest.
parlCymru <- tibble(
  meeting_id = parlCymru$Meeting_ID,
  date = as.Date(parlCymru$MeetingDate),
  contribution_id = parlCymru$Contribution_ID,
  # Values parsed from XML arrive as character strings, so convert before
  # adding 1 (presumably shifting a zero-based order to one-based -- confirm).
  order_by_meeting = as.numeric(parlCymru$Contribution_Order_ID) + 1,
  language = parlCymru$contribution_language,
  agenda_item = parlCymru$Agenda_item_english,
  member_id = parlCymru$Member_Id,
  member_name = parlCymru$Member_name_English,
  role = parlCymru$Member_job_title_English,
  contribution_eng = parlCymru$Contribution,
  contribution_welsh = parlCymru$Contribution_Welsh,
  is_speech = parlCymru$is_speech
)
# Keep meetings dated 2016-05-05 to 2021-05-05 inclusive. The filter now
# reads the tibble built above; it previously referenced `parlCymruV1`,
# which is never defined in this script.
parlCymru <- filter(
  parlCymru,
  date >= as.Date("2016-05-05") & date <= as.Date("2021-05-05")
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment