How to Create a Large Structured Text Dataset Using R
# How to analyse 2.2 million words from 786 different speeches and interviews
# by the Prime Minister of Australia from Jan 2020 - July 2021. After hearing
# one of the PM's speeches I began to wonder if transcripts of all his
# speeches and interviews were publicly available. It turns out they are!
# This code extract shows how I scraped the full text of 786 speeches and
# interviews using the #rvest package, which is installed with the tidyverse
# but needs to be attached separately.
# I recommend @Julia Silge's #tidytext tools for analysis.
# Enjoy!
# Matt Rosinski:
# - Twitter: @machinatoonist
# - LinkedIn: https://www.linkedin.com/feed/update/urn:li:activity:6822642493830631424/
# LIBRARIES ----
library(tidyverse) # Main Package - Attaches dplyr, purrr, stringr, tidyr, forcats and more
library(rvest)     # Installed with the tidyverse but not attached by library(tidyverse)
library(tidytext)
library(lubridate)
# Press releases to scrape ----
# Quick check that the media index page is reachable
media <- read_html("https://www.pm.gov.au/media/")
# Setup for collecting all links to media pages ----
media_pages <- tibble(page_num = 0:108) %>%
    mutate(page = paste0("https://www.pm.gov.au/media?page=", page_num))
# Recommend using the SelectorGadget Chrome extension for identifying HTML nodes to target
# Create a function to get links for the articles/pages to scrape ----
get_link_df <- function(page) {
    content <- read_html(page)
    title <- content %>%
        html_nodes(".media-title a") %>%
        html_text()
    date <- content %>%
        html_nodes(".date-display-single") %>%
        html_text()
    type <- content %>%
        html_nodes(".media-type") %>%
        html_text()
    content %>%
        html_nodes(".media-title a") %>%
        html_attr(name = "href") %>%
        as_tibble() %>%
        mutate(value = paste0("https://www.pm.gov.au", value)) %>%
        mutate(title = title,
               date  = date,
               type  = type) %>%
        relocate(title, date, type)
}
# Test Function ----
page <- "https://www.pm.gov.au/media?page=0"
view_links <- get_link_df(page)
# Run Query (long running) ----
# Using the map function from purrr to apply get_link_df() to all 109 index pages
all_pm_media_blurbs <- media_pages %>%
    mutate(link_df = map(page, get_link_df))
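# Optional: rate-limit the requests to go easy on the server. A minimal
# sketch using purrr::slowly() - the 1-second pause is my assumption,
# not something the original run used:
# get_link_df_slowly <- slowly(get_link_df, rate = rate_delay(1))
# all_pm_media_blurbs <- media_pages %>%
#     mutate(link_df = map(page, get_link_df_slowly))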
# Unnest media blurbs ----
pm_media_unnested <- all_pm_media_blurbs %>%
    unnest(cols = link_df)
pm_media_unnested %>% glimpse()
# Checkpoint: save the link table so the scrape above need not be re-run
write_csv(pm_media_unnested, "pm_media_blurbs.csv")
pm_media_unnested <- read_csv("pm_media_blurbs.csv")
# Scrape the links ----
# Testing on a single transcript first
link <- "https://www.pm.gov.au/media/press-conference-kirribilli-nsw-7"
get_text <- read_html(link)
content <- get_text %>%
    html_nodes("#block-system-main p") %>%
    html_text(trim = TRUE) %>%
    as_tibble()
# Split each paragraph into speaker and text at the first colon, then fill
# the speaker down/up so paragraphs without a colon keep the current speaker
content %>%
    mutate(linenumber = row_number(),
           sequence = cumsum(str_detect(value, pattern = ":"))) %>%
    separate(value, c("speaker", "text"), ":", extra = "merge", fill = "left") %>%
    fill(speaker, .direction = "downup")
# Create a function to scrape text ----
# library(progress)
# Using the progress bar caused an error for me so commented out
# pb <- progress_bar$new(total = nrow(pm_media_unnested),
#                        format = "executing [:bar] :percent eta::eta")
all_pm_transcripts <- function(page) {
    # pb$tick()
    # Sys.sleep(1/100)
    get_text <- read_html(page)
    content <- get_text %>%
        html_nodes("#block-system-main p") %>%
        html_text(trim = TRUE) %>%
        as_tibble()
    content %>%
        mutate(linenumber = row_number(),
               sequence = cumsum(str_detect(value, pattern = ":"))) %>%
        separate(value, c("speaker", "text"), ":", extra = "merge", fill = "left") %>%
        fill(speaker, .direction = "downup")
    # pb$terminate()
    # invisible()
}
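# Note: newer versions of purrr (>= 1.0.0) have a progress bar built into
# map() itself, which sidesteps the progress package error above. A minimal
# sketch, assuming purrr >= 1.0.0 is installed:
# pm_text_extract <- pm_media_unnested %>%
#     mutate(extract = map(value, all_pm_transcripts, .progress = TRUE))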
# Scrape the actual text ----
all_pm_transcripts(link)
# Dry run on the first two links before the full scrape
test_extract <- pm_media_unnested %>%
    slice_head(n = 2) %>%
    mutate(extract = map(value, all_pm_transcripts)) %>%
    unnest(extract) %>%
    mutate(accessed = Sys.Date())
test_extract %>% glimpse()
# Full scrape (long running) - the braces let all three steps run as one block
{
    pm_text_extract <- pm_media_unnested %>%
        # slice_head(n = 5) %>% # Use slice for checking output
        mutate(extract = map(value, all_pm_transcripts))
    pm_text_extract_flat_df <- pm_text_extract %>%
        unnest(extract) %>%
        mutate(accessed = Sys.Date())
    # Save extracted data ----
    write_csv(pm_text_extract_flat_df, "pm_text_extract.csv")
}
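# For a scrape this long, one bad link aborts the whole map() and loses
# everything collected so far. A minimal sketch using purrr::possibly() to
# skip failures instead (my addition, not part of the original run):
# safe_pm_transcripts <- possibly(all_pm_transcripts, otherwise = NULL)
# pm_text_extract <- pm_media_unnested %>%
#     mutate(extract = map(value, safe_pm_transcripts))
# unnest() then drops the NULL entries, so failed pages simply fall out.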
# Preliminary analysis and cleaning ----
pm_text_extract_flat_df %>%
    count(type, sort = TRUE)
pm_text_extract_flat_df %>% glimpse()
pm_text_extract_flat_df %>%
    count(page, sort = TRUE)
pm_text_extract_flat_df %>%
    count(speaker, sort = TRUE)
# Drop rows without a speaker, remove false "speakers" created where a
# colon appeared mid-sentence, and parse the dates
dataset <- pm_text_extract_flat_df %>%
    filter(!is.na(speaker)) %>%
    filter(!(speaker %in% c("Key projects to be funded include", "They are these"))) %>%
    select(-c(page_num, page)) %>%
    mutate(date = lubridate::dmy(date))
dataset %>%
    count(speaker, sort = TRUE)
dataset %>% glimpse()
dataset %>%
    count(value, sort = TRUE)
write_csv(dataset, "tidy_pm_dataset.csv")
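# Tokenize for analysis ----
# A minimal sketch of the tidytext workflow recommended above: one word per
# row, stop words removed. Column names match the dataset built here;
# stop_words ships with tidytext.
tidy_words <- dataset %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word")
tidy_words %>%
    count(word, sort = TRUE)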