## Source: dbrby/snp21_conference_tmod.R

## snp21_conference_tmod.R
pacman::p_load(rvest, purrr, textclean, tidyverse, quanteda,
               quanteda.textplots, quanteda.textmodels, stm)

## ---- 1. Scrape the #SNP21 conference speeches ------------------------------
## Fetch the news index once and keep every distinct link whose href contains
## "address-to-snp21"; each of those is read as one speech page.
news_url <- "https://www.snp.org/news/?_load_more=2"
pg <- read_html(news_url)

links <- pg %>%
  html_elements("a") %>%
  html_attr("href") %>%
  str_subset("address-to-snp21") %>%
  unique()

## Fail early with a readable message; otherwise an empty link set would only
## surface further down, when the corpus is built from an empty table.
if (length(links) == 0) {
  stop("No 'address-to-snp21' links found at ", news_url, call. = FALSE)
}

pages <- lapply(links, read_html)

## One row per speech page: the heading text and the whitespace-normalised
## body text.
out <- pages %>%
  map(function(page) {
    tibble(
      title = page %>% html_element(".content__offset") %>% html_text(),
      speech = page %>%
        html_element(".wysiwyg-ce") %>%
        html_text() %>%
        replace_white() %>%
        trimws()
    )
  }) %>%
  bind_rows()

## ---- 2. Derive the speaker name from the page title ------------------------
## Strips the "<name>’s address ...", "<name>’ address ..." and
## "<name>’s full address ..." suffixes in one pass. The result is trimmed
## because it becomes the document id (and therefore the plot label).
out$speaker <- trimws(
  gsub("’s?( full)? address to #SNP21 Conference", "", out$title)
)

## ---- 3. Corpus, tokens and document-feature matrix -------------------------
snp_corp <- corpus(out, text_field = "speech", docid_field = "speaker")

## Extra words dropped on top of the standard English stopword list.
extra_stopwords <- c("welcome", "good", "afternoon", "morning", "people")

snp_toks <- snp_corp %>%
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_separators = TRUE,
    remove_url = TRUE
  ) %>%
  tokens_remove(c(stopwords("en"), extra_stopwords))

## Keep only features that occur in at least two speeches.
snp_dfm <- dfm(snp_toks) %>%
  dfm_trim(min_docfreq = 2)

## ---- 4. Wordfish scaling and plots -----------------------------------------
## `dir` identifies the direction of the scale: document 3 is constrained to
## score below document 1.
## NOTE(review): these are positional indices, so they depend on the order in
## which the links were scraped (and need at least three speeches) -- confirm
## they still point at the intended speakers if the news page changes.
tmod_wf <- textmodel_wordfish(snp_dfm, dir = c(3, 1))

## Estimated position of each speech.
textplot_scale1d(tmod_wf)

## Estimated position of each word, with selected policy terms highlighted.
textplot_scale1d(
  tmod_wf,
  margin = "features",
  highlighted = c(
    "parents", "promise", "pledge",
    "inequality", "referendum",
    "campaign", "europe", "education",
    "health", "independence"
  )
)