How to Create a Large Structured Text Dataset Using R
# How to analyse 2.2 million words from 786 different speeches and interviews
# by the Prime Minister of Australia from Jan 2020 - July 2021. After hearing
# one of the PM's speeches, I began to wonder whether transcripts of all his
# speeches and interviews were publicly available. It turns out they are!
# This code extract shows how I scraped the full text of 786 speeches and
# interviews using the #rvest package, which is installed with the tidyverse
# (but must be attached separately).
# I recommend @Julia Silge's #Tidytext tools for analysis.
# Enjoy!
# Matt Rosinski:
# - Twitter: @machinatoonist
# - LinkedIn: https://www.linkedin.com/feed/update/urn:li:activity:6822642493830631424/
# LIBRARIES ----
library(tidyverse)  # Attaches dplyr, purrr, readr, stringr, tidyr, tibble, forcats, ggplot2
library(rvest)      # Installed with the tidyverse but not attached by library(tidyverse)
library(tidytext)
library(lubridate)
# Press releases to scrape ----
media <- read_html("https://www.pm.gov.au/media/")  # Quick check that the site is reachable
# Setup for collecting all links to media pages ----
# Builds "https://www.pm.gov.au/media?page=0" through "...?page=108"
media_pages <- tibble(page_num = 0:108) %>%
    mutate(page = paste0("https://www.pm.gov.au/media?page=", page_num))
# Recommend using the SelectorGadget Chrome extension for identifying HTML nodes to target
# Create a function to get links for the articles/pages to scrape ----
# Note: this assumes the title, date, and type selectors all return one match
# per article, so the three vectors line up row-wise
get_link_df <- function(page) {
    content <- read_html(page)
    # Article titles
    title <- content %>%
        html_nodes(".media-title a") %>%
        html_text()
    # Publication dates
    date <- content %>%
        html_nodes(".date-display-single") %>%
        html_text()
    # Media type (e.g. transcript, media release)
    type <- content %>%
        html_nodes(".media-type") %>%
        html_text()
    # Relative links to each article, expanded to absolute URLs
    content %>%
        html_nodes(".media-title a") %>%
        html_attr(name = "href") %>%
        as_tibble() %>%
        mutate(value = paste0("https://www.pm.gov.au", value)) %>%
        mutate(title = title,
               date = date,
               type = type) %>%
        relocate(title, date, type)
}
# Test function ----
page <- "https://www.pm.gov.au/media?page=0"
view_links <- get_link_df(page)
# Run query (long running) ----
# Iterate get_link_df() over all 109 listing pages using purrr::map()
all_pm_media_blurbs <- media_pages %>%
    mutate(link_df = map(page, get_link_df))
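# Optional hardening sketch (my addition, not part of the original run):
# purrr::possibly() makes a failed page return NULL instead of aborting the
# whole map, and a short Sys.sleep() keeps the request rate polite.
get_link_df_safely <- possibly(function(page) {
    Sys.sleep(0.5)    # pause half a second between requests
    get_link_df(page)
}, otherwise = NULL)
# all_pm_media_blurbs <- media_pages %>%
#     mutate(link_df = map(page, get_link_df_safely))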
# Unnest media blurbs ----
pm_media_unnested <- all_pm_media_blurbs %>%
    unnest(cols = link_df)
pm_media_unnested %>% glimpse()
# Checkpoint: save the link table so the scrape doesn't need re-running
write_csv(pm_media_unnested, "pm_media_blurbs.csv")
pm_media_unnested <- read_csv("pm_media_blurbs.csv")
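# Optional caching sketch (my addition): skip the long-running scrape when a
# saved copy already exists on disk.
# if (!file.exists("pm_media_blurbs.csv")) {
#     all_pm_media_blurbs <- media_pages %>%
#         mutate(link_df = map(page, get_link_df))
#     pm_media_unnested <- unnest(all_pm_media_blurbs, cols = link_df)
#     write_csv(pm_media_unnested, "pm_media_blurbs.csv")
# }
# pm_media_unnested <- read_csv("pm_media_blurbs.csv")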
# Scrape the links ----
# Testing on a single transcript first
link <- "https://www.pm.gov.au/media/press-conference-kirribilli-nsw-7"
get_text <- read_html(link)
content <- get_text %>%
    html_nodes("#block-system-main p") %>%
    html_text(trim = TRUE) %>%
    as_tibble()
# Split each paragraph into speaker and text at the first colon; paragraphs
# without a colon inherit the most recent speaker via fill()
content %>%
    mutate(linenumber = row_number(),
           sequence = cumsum(str_detect(value, pattern = ":"))) %>%
    separate(value, c("speaker", "text"), ":", extra = "merge", fill = "left") %>%
    fill(speaker, .direction = "downup")
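# Worked illustration (hypothetical lines, my addition) of how separate() and
# fill() recover the speaker for paragraphs that lack a "SPEAKER:" prefix:
demo <- tibble(value = c("PRIME MINISTER: Good morning, everyone.",
                         "Thank you all for coming.",
                         "JOURNALIST: A question on the rollout?"))
demo %>%
    separate(value, c("speaker", "text"), ":", extra = "merge", fill = "left") %>%
    fill(speaker, .direction = "downup")
# The second line has no colon, so separate() leaves speaker as NA and fill()
# carries "PRIME MINISTER" down onto it.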
# Create a function to scrape text ----
# library(progress)
# Using the progress bar caused an error for me, so it is commented out
# pb <- progress_bar$new(total = nrow(pm_media_unnested),
#                        format = "executing [:bar] :percent eta::eta")
all_pm_transcripts <- function(page) {
    # pb$tick()
    # Sys.sleep(1/100)
    get_text <- read_html(page)
    content <- get_text %>%
        html_nodes("#block-system-main p") %>%
        html_text(trim = TRUE) %>%
        as_tibble()
    # Same speaker/text split as the single-page test above
    content %>%
        mutate(linenumber = row_number(),
               sequence = cumsum(str_detect(value, pattern = ":"))) %>%
        separate(value, c("speaker", "text"), ":", extra = "merge", fill = "left") %>%
        fill(speaker, .direction = "downup")
    # pb$terminate()
    # invisible()
}
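# Note (my addition; assumes purrr >= 1.0.0): map() now has a built-in
# progress bar, which sidesteps the progress_bar error above:
# pm_text_extract <- pm_media_unnested %>%
#     mutate(extract = map(value, all_pm_transcripts, .progress = TRUE))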
# Scrape the actual text ----
all_pm_transcripts(link)
# Check the output on the first two articles before running the full scrape
test_extract <- pm_media_unnested %>%
    slice_head(n = 2) %>%
    mutate(extract = map(value, all_pm_transcripts)) %>%
    unnest(extract) %>%
    mutate(accessed = Sys.Date())
test_extract %>% glimpse()
# Run the full scrape and save (long running) ----
{
    pm_text_extract <- pm_media_unnested %>%
        # slice_head(n = 5) %>%  # Use slice for checking output
        mutate(extract = map(value, all_pm_transcripts))
    pm_text_extract_flat_df <- pm_text_extract %>%
        unnest(extract) %>%
        mutate(accessed = Sys.Date())
    # Save extracted data ----
    write_csv(pm_text_extract_flat_df, "pm_text_extract.csv")
}
# Preliminary analysis and cleaning ----
pm_text_extract_flat_df %>%
    count(type, sort = TRUE)
pm_text_extract_flat_df %>% glimpse()
pm_text_extract_flat_df %>%
    count(page, sort = TRUE)
pm_text_extract_flat_df %>%
    count(speaker, sort = TRUE)
# Drop rows where the colon split produced non-speaker labels, remove helper
# columns, and parse the date strings into Date objects
dataset <- pm_text_extract_flat_df %>%
    filter(!is.na(speaker)) %>%
    filter(!(speaker %in% c("Key projects to be funded include", "They are these"))) %>%
    select(-c(page_num, page)) %>%
    mutate(date = lubridate::dmy(date))
dataset %>%
    count(speaker, sort = TRUE)
dataset %>% glimpse()
dataset %>%
    count(value, sort = TRUE)
write_csv(dataset, "tidy_pm_dataset.csv")
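# A minimal tidytext sketch (my addition; tidytext is attached above but not
# otherwise used in this extract): tokenise the transcripts into words, drop
# stop words, and count the most frequent terms, per the tidytext approach
# recommended in the header.
tidy_words <- dataset %>%
    unnest_tokens(word, text)
nrow(tidy_words)    # total word count across all transcripts
tidy_words %>%
    anti_join(stop_words, by = "word") %>%
    count(word, sort = TRUE)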