Skip to content

Instantly share code, notes, and snippets.

@dbrby
Created August 5, 2021 14:03
Show Gist options
  • Save dbrby/e07144735b5f7c555490391c22b34f23 to your computer and use it in GitHub Desktop.
Save dbrby/e07144735b5f7c555490391c22b34f23 to your computer and use it in GitHub Desktop.
require(rvest)
require(pbapply)
require(tidyverse)
require(tidywikidatar)
# Site root that the hrefs scraped from the page are appended to further down.
base_url <- "https://en.wikipedia.org/"

# Download and parse the Wikipedia list of members of the 5th Assembly.
pg <- read_html("https://en.wikipedia.org/wiki/Members_of_the_5th_National_Assembly_for_Wales")

# Keep the fifth ".wikitable" element on the page twice over: as a node
# (`data`, searched for links later) and as a data frame (`df`).
data <- html_elements(pg, ".wikitable")[[5]]
df <- html_table(data)
## Look up the href of a link inside a scraped Wikipedia table by its text.
##
## html_table: an rvest/xml2 node (e.g. a table element) to search inside.
## class:      the exact link text to match (a single string).
## Returns the `href` attribute of the first matching <a> element (NA if that
## element has no href). Stops with an error when no link with that text
## exists inside `html_table`.
get_link_table <- function(html_table, class) {
  # XPath 1.0 has no escape character inside string literals, so text that
  # contains an apostrophe (e.g. "O'Neill") is assembled with concat().
  if (grepl("'", class, fixed = TRUE)) {
    literal <- paste0(
      "concat('", gsub("'", "', \"'\", '", class, fixed = TRUE), "')"
    )
  } else {
    literal <- paste0("'", class, "'")
  }

  # The leading "." anchors the search to `html_table`; a bare "//a" is
  # evaluated from the document root and can match links outside the table.
  links <- html_elements(
    html_table,
    xpath = paste0(".//a[text()=", literal, "]")
  )
  if (length(links) == 0) {
    stop("No link with text '", class, "' found in table", call. = FALSE)
  }
  html_attr(links[[1]], "href")
}
# Resolve each member's Wikipedia href from the name shown in the table.
# vapply() pins the result to one string per name, whatever the input.
df$MS_link <- vapply(
  df$Name,
  function(x) get_link_table(data, x),
  character(1),
  USE.NAMES = FALSE
)

# base_url ends in "/"; if the scraped hrefs also begin with "/" (as
# Wikipedia's relative links normally do) plain concatenation would request
# "https://en.wikipedia.org//wiki/...". Join with exactly one slash instead.
welsh_MS <- pblapply(
  paste0(sub("/+$", "", base_url), "/", sub("^/+", "", df$MS_link)),
  read_html
)

# Each article page carries a link to its Wikidata entity; stripping the URL
# prefix and the "#sitelinks-wikipedia" fragment leaves whatever sits between
# them (presumably the bare entity id, e.g. "Q..." - confirm).
wikidata_front <- "https://www.wikidata.org/wiki/Special:EntityPage/"
wiki_data_delete <- "#sitelinks-wikipedia"
welsh_wikidata_links <- lapply(welsh_MS, function(wikidata_grabber) {
  wikidata_grabber %>%
    html_element(".wb-langlinks-link a") %>%
    html_attr("href") %>%
    gsub(paste0(wikidata_front, "|", wiki_data_delete), "", .)
})
df$wikidataid <- unlist(welsh_wikidata_links)

# Fetch Wikidata property P4651 for every member; its value becomes
# `member_id` below (presumably the Senedd member id - confirm).
welsh_ids <- tw_get_property(
  id = df$wikidataid,
  p = "P4651",
  language = "en"
)

# NOTE(review): positional selection - relies on the scraped table's column
# order; the first column is dropped here.
df <- df[, 2:7]
df <- left_join(df, welsh_ids, by = c("wikidataid" = "id"))

# Drop helper columns and switch to the snake_case names used in parlCymru.
df <- df %>%
  select(-property, -MS_link, -`Senedd Leader`) %>%
  rename(
    party = Party,
    member_name = Name,
    electoral_district = `Constituency or Region`,
    member_id = value
  )
################################################################################
## Missing Cases
parlCymru <- read_csv("data/parlCymruV0.csv")

# Diagnostic table: one row per member_id in parlCymru, flagged by whether
# that id also appears in the scraped `df`.
# `pc` has to be derived from parlCymru - nothing visible in this script
# defines it beforehand, so reading from `pc` itself fails.
pc <- parlCymru %>% distinct(member_id, .keep_all = TRUE)
pc$linked <- pc$member_id %in% df$member_id
pc <- pc %>% select(member_id, member_name, linked)

# Hand-entered members to add alongside the scraped ones, with full Wikipedia
# URLs (no base_url prefix needed).
not_linked <- tibble(
  member_name = c("Nathan Gill", "Simon Thomas", "Steffan Lewis",
                  "Mohammed Asghar", "Carl Sargeant", "Alun Cairns"),
  member_id = c(433, 383, 289, 130, 178, 12),
  party = c("UKIP", "Plaid Cymru", "Plaid Cymru", "Conservative", "Labour",
            "Conservative"),
  MS_link = c("https://en.wikipedia.org/wiki/Nathan_Gill",
              "https://en.wikipedia.org/wiki/Simon_Thomas_(politician)",
              "https://en.wikipedia.org/wiki/Steffan_Lewis",
              "https://en.wikipedia.org/wiki/Mohammad_Asghar",
              "https://en.wikipedia.org/wiki/Carl_Sargeant",
              "https://en.wikipedia.org/wiki/Alun_Cairns"),
  electoral_district = c("North Wales", "Mid and West Wales",
                         "South East Wales", "South Wales East",
                         "Alyn and Deeside", "South Wales West")
)

# Same Wikidata-link extraction as for the scraped members.
not_linked_MS <- pblapply(not_linked$MS_link, read_html)
nl_wikidata_links <- lapply(not_linked_MS, function(wikidata_grabber) {
  wikidata_grabber %>%
    html_element(".wb-langlinks-link a") %>%
    html_attr("href") %>%
    gsub(paste0(wikidata_front, "|", wiki_data_delete), "", .)
})
not_linked$wikidataid <- unlist(nl_wikidata_links)

# Make member_id numeric in df so it matches the numeric ids in not_linked
# before the rows are stacked.
df$member_id <- as.numeric(df$member_id)
MS <- bind_rows(df, not_linked)

# Replace parlCymru's own member_name with the member details from MS.
parlCymru <- parlCymru %>% select(-member_name)
parlCymru <- left_join(parlCymru, MS, by = "member_id")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment