Skip to content

Instantly share code, notes, and snippets.

@dbrby
Created August 19, 2021 02:42
Show Gist options
  • Save dbrby/cf9c8a39bdf1294336933a2cfe45278f to your computer and use it in GitHub Desktop.
Save dbrby/cf9c8a39bdf1294336933a2cfe45278f to your computer and use it in GitHub Desktop.
require(rvest)
require(purrr)
require(textclean)
require(tidyverse)
require(hansard)
require(quanteda)
require(quanteda.textmodels)
require(quanteda.textplots)
# Download the Hansard page for the Commons debate on Afghanistan.
pg <- read_html("https://hansard.parliament.uk/commons/2021-08-18/debates/A86142BD-A204-4BC8-BBC0-ACA7BAD7E9F0/Afghanistan")

# First number found in each linked speaker's href, kept as a string; it is
# attached to the speech table as `member_id` further down the script.
speaker_link <- html_elements(
  pg,
  "[class = 'attributed-to-details with-link']"
) %>%
  html_attr("href") %>%
  parse_number() %>%
  as.character()

# Text of every `.primary-text` node (the speaker labels), with whitespace
# cleaned up and leading/trailing blanks removed.
speakers <- html_elements(pg, ".primary-text") %>%
  html_text() %>%
  replace_white() %>%
  trimws()

# Text of every `.content` node (the contributions), cleaned the same way.
text <- html_elements(pg, ".content") %>%
  html_text() %>%
  replace_white() %>%
  trimws()
# One row per contribution: the speaker label next to the spoken text.
df <- tibble(speakers, text)

# Drop the procedural "Several hon. Members rose—" entries.
# NOTE(review): the positional assignment of `member_id` below assumes these
# are exactly the entries that carry no profile link -- confirm against the
# page markup.
df <- filter(df, speakers != "Several hon. Members rose—")

# Fail loudly on a count mismatch rather than risk recycled or misaligned IDs.
stopifnot(length(speaker_link) == nrow(df))
df$member_id <- speaker_link

# Look up MPs as of the day of the debate (the date in the Hansard URL), not
# the day the script happens to be run. With `Sys.Date()`, members who have
# since left the House find no match in the join and are then silently
# dropped by the filter below.
mps <- mnis::mnis_mps_on_date(date1 = as.Date("2021-08-18"))
df <- left_join(df, mps, by = "member_id")

# Remove the Speaker's contributions. Rows without a match in `mps` have an NA
# `list_as`; the bare `!=` comparison already discarded them implicitly, so
# the NA test only makes that existing behaviour explicit.
df <- filter(df, !is.na(list_as), list_as != "Hoyle, Sir Lindsay")
# Build a corpus from the speech table and merge each member's contributions
# into a single document (grouping on `member_id`).
df_corp <- corpus_group(
  corpus(df, text_field = "text"),
  groups = member_id
)

# Round-trip through a data frame so the corpus can be rebuilt with `list_as`
# as the document identifier.
df <- convert(df_corp, to = "data.frame")
df_corp <- corpus(
  df,
  docid_field = "list_as",
  text_field = "text"
)
# Tokenise the per-member documents, discarding punctuation, symbols,
# separators and numbers, then remove English stopwords.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
df_toks <- tokens(
  df_corp,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_numbers = TRUE
) %>%
  tokens_remove(stopwords("en"))

# Document-feature matrix, keeping only features that occur in at least two
# documents.
df_dfm <- dfm(df_toks) %>%
  dfm_trim(min_docfreq = 2)
# Fit a Wordfish scaling model on the document-feature matrix. `dir` anchors
# the direction of the scale using two document indices.
# NOTE(review): the indices 3 and 1 depend on the document order in `df_dfm`;
# confirm which members they refer to.
tmod_wf <- textmodel_wordfish(df_dfm, dir = c(3, 1))

# Plot the fitted model with documents grouped by the `party_text` docvar.
textplot_scale1d(tmod_wf, groups = df_corp$party_text)

# Plot the feature margin of the model, highlighting a hand-picked set of
# terms.
textplot_scale1d(
  tmod_wf,
  margin = "features",
  highlighted = c(
    "burden", "sorry", "defeat",
    "europe", "order", "uk", "role",
    "asylum", "belief", "relocation",
    "taliban", "persecuted", "resettle", "export",
    "demand"
  )
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment