How to Create a Large Structured Text Dataset Using R
# How to analyse 2.2 million words from 786 different speeches and interviews
# by the Prime Minister of Australia from Jan 2020 - July 2021. After hearing
# one of the PM's speeches, I began to wonder whether transcripts of all his
# speeches and interviews were publicly available. It turns out they are!
# This code extract shows how I scraped the full text of 786 speeches and
# interviews using the #rvest package, which is installed with the tidyverse
# (but must be attached separately).
# I recommend @Julia Silge's #Tidytext tools for analysis.
# Enjoy!
# Matt Rosinski:
# - Twitter: @machinatoonist
# - LinkedIn: https://www.linkedin.com/feed/update/urn:li:activity:6822642493830631424/
# LIBRARIES ----
library(tidyverse)  # Attaches dplyr, purrr, readr, stringr, tidyr, tibble, forcats, ggplot2
library(rvest)      # Installed with the tidyverse but not attached by library(tidyverse)
library(tidytext)
library(lubridate)
# Press releases to scrape ----
media <- read_html("https://www.pm.gov.au/media/")  # Quick check that the site is reachable
# Setup for collecting all links to media pages ----
# Builds "https://www.pm.gov.au/media?page=0" through "...?page=108"
media_pages <- tibble(page_num = 0:108) %>%
    mutate(page = paste0("https://www.pm.gov.au/media?page=", page_num))
# Recommend using the SelectorGadget Chrome extension for identifying HTML nodes to target
# Create a function to get links for the articles/pages to scrape ----
# Note: this assumes the title, date, and type selectors all return one match
# per article, so the three vectors line up row-wise
get_link_df <- function(page) {
    content <- read_html(page)
    # Article titles
    title <- content %>%
        html_nodes(".media-title a") %>%
        html_text()
    # Publication dates
    date <- content %>%
        html_nodes(".date-display-single") %>%
        html_text()
    # Media type (e.g. transcript, media release)
    type <- content %>%
        html_nodes(".media-type") %>%
        html_text()
    # Relative links to each article, expanded to absolute URLs
    content %>%
        html_nodes(".media-title a") %>%
        html_attr(name = "href") %>%
        as_tibble() %>%
        mutate(value = paste0("https://www.pm.gov.au", value)) %>%
        mutate(title = title,
               date = date,
               type = type) %>%
        relocate(title, date, type)
}
# Test function ----
page <- "https://www.pm.gov.au/media?page=0"
view_links <- get_link_df(page)
# Run query (long running) ----
# Iterate get_link_df() over all 109 listing pages using purrr::map()
all_pm_media_blurbs <- media_pages %>%
    mutate(link_df = map(page, get_link_df))
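# Optional hardening sketch (my addition, not part of the original run):
# purrr::possibly() makes a failed page return NULL instead of aborting the
# whole map, and a short Sys.sleep() keeps the request rate polite.
get_link_df_safely <- possibly(function(page) {
    Sys.sleep(0.5)    # pause half a second between requests
    get_link_df(page)
}, otherwise = NULL)
# all_pm_media_blurbs <- media_pages %>%
#     mutate(link_df = map(page, get_link_df_safely))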
# Unnest media blurbs ----
pm_media_unnested <- all_pm_media_blurbs %>%
    unnest(cols = link_df)
pm_media_unnested %>% glimpse()
# Checkpoint: save the link table so the scrape doesn't need re-running
write_csv(pm_media_unnested, "pm_media_blurbs.csv")
pm_media_unnested <- read_csv("pm_media_blurbs.csv")
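# Optional caching sketch (my addition): skip the long-running scrape when a
# saved copy already exists on disk.
# if (!file.exists("pm_media_blurbs.csv")) {
#     all_pm_media_blurbs <- media_pages %>%
#         mutate(link_df = map(page, get_link_df))
#     pm_media_unnested <- unnest(all_pm_media_blurbs, cols = link_df)
#     write_csv(pm_media_unnested, "pm_media_blurbs.csv")
# }
# pm_media_unnested <- read_csv("pm_media_blurbs.csv")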
# Scrape the links ----
# Testing on a single transcript first
link <- "https://www.pm.gov.au/media/press-conference-kirribilli-nsw-7"
get_text <- read_html(link)
content <- get_text %>%
    html_nodes("#block-system-main p") %>%
    html_text(trim = TRUE) %>%
    as_tibble()
# Split each paragraph into speaker and text at the first colon; paragraphs
# without a colon inherit the most recent speaker via fill()
content %>%
    mutate(linenumber = row_number(),
           sequence = cumsum(str_detect(value, pattern = ":"))) %>%
    separate(value, c("speaker", "text"), ":", extra = "merge", fill = "left") %>%
    fill(speaker, .direction = "downup")
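# Worked illustration (hypothetical lines, my addition) of how separate() and
# fill() recover the speaker for paragraphs that lack a "SPEAKER:" prefix:
demo <- tibble(value = c("PRIME MINISTER: Good morning, everyone.",
                         "Thank you all for coming.",
                         "JOURNALIST: A question on the rollout?"))
demo %>%
    separate(value, c("speaker", "text"), ":", extra = "merge", fill = "left") %>%
    fill(speaker, .direction = "downup")
# The second line has no colon, so separate() leaves speaker as NA and fill()
# carries "PRIME MINISTER" down onto it.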
# Create a function to scrape text ----
# library(progress)
# Using the progress bar caused an error for me, so it is commented out
# pb <- progress_bar$new(total = nrow(pm_media_unnested),
#                        format = "executing [:bar] :percent eta::eta")
all_pm_transcripts <- function(page) {
    # pb$tick()
    # Sys.sleep(1/100)
    get_text <- read_html(page)
    content <- get_text %>%
        html_nodes("#block-system-main p") %>%
        html_text(trim = TRUE) %>%
        as_tibble()
    # Same speaker/text split as the single-page test above
    content %>%
        mutate(linenumber = row_number(),
               sequence = cumsum(str_detect(value, pattern = ":"))) %>%
        separate(value, c("speaker", "text"), ":", extra = "merge", fill = "left") %>%
        fill(speaker, .direction = "downup")
    # pb$terminate()
    # invisible()
}
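# Note (my addition; assumes purrr >= 1.0.0): map() now has a built-in
# progress bar, which sidesteps the progress_bar error above:
# pm_text_extract <- pm_media_unnested %>%
#     mutate(extract = map(value, all_pm_transcripts, .progress = TRUE))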
# Scrape the actual text ----
all_pm_transcripts(link)
# Check the output on the first two articles before running the full scrape
test_extract <- pm_media_unnested %>%
    slice_head(n = 2) %>%
    mutate(extract = map(value, all_pm_transcripts)) %>%
    unnest(extract) %>%
    mutate(accessed = Sys.Date())
test_extract %>% glimpse()
# Run the full scrape and save (long running) ----
{
    pm_text_extract <- pm_media_unnested %>%
        # slice_head(n = 5) %>%  # Use slice for checking output
        mutate(extract = map(value, all_pm_transcripts))
    pm_text_extract_flat_df <- pm_text_extract %>%
        unnest(extract) %>%
        mutate(accessed = Sys.Date())
    # Save extracted data ----
    write_csv(pm_text_extract_flat_df, "pm_text_extract.csv")
}
# Preliminary analysis and cleaning ----
pm_text_extract_flat_df %>%
    count(type, sort = TRUE)
pm_text_extract_flat_df %>% glimpse()
pm_text_extract_flat_df %>%
    count(page, sort = TRUE)
pm_text_extract_flat_df %>%
    count(speaker, sort = TRUE)
# Drop rows where the colon split produced non-speaker labels, remove helper
# columns, and parse the date strings into Date objects
dataset <- pm_text_extract_flat_df %>%
    filter(!is.na(speaker)) %>%
    filter(!(speaker %in% c("Key projects to be funded include", "They are these"))) %>%
    select(-c(page_num, page)) %>%
    mutate(date = lubridate::dmy(date))
dataset %>%
    count(speaker, sort = TRUE)
dataset %>% glimpse()
dataset %>%
    count(value, sort = TRUE)
write_csv(dataset, "tidy_pm_dataset.csv")
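# A minimal tidytext sketch (my addition; tidytext is attached above but not
# otherwise used in this extract): tokenise the transcripts into words, drop
# stop words, and count the most frequent terms, per the tidytext approach
# recommended in the header.
tidy_words <- dataset %>%
    unnest_tokens(word, text)
nrow(tidy_words)    # total word count across all transcripts
tidy_words %>%
    anti_join(stop_words, by = "word") %>%
    count(word, sort = TRUE)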