Last active
December 8, 2022 23:27
-
-
Save MattCowgill/5f283eb410116c9da32e166639b947c6 to your computer and use it in GitHub Desktop.
A function to scrape the full text of Reserve Bank of Australia monetary policy decisions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(tidyverse) | |
#' Scrape RBA monetary policy decision media releases in a tidy tibble
#'
#' Reads the RBA monetary policy landing page, follows the links to the yearly
#' interest rate decision indexes, and then downloads every media release
#' linked from those indexes.
#'
#' @param min_year If `NULL` (the default), all releases will be scraped. If a
#'   year is specified (eg. `2015`), only releases from that year onwards will
#'   be scraped.
#' @return A tibble with the columns `date` (release date), `author` (name
#'   extracted from the release title) and `text` (whitespace-normalised body
#'   of the release). Fields that cannot be found on a page are `NA` rather
#'   than causing the release to be dropped.
#' @author Matt Cowgill
#' @examples
#' all_decisions <- scrape_monpol_decisions()
#' covid_decisions <- scrape_monpol_decisions(2020)
scrape_monpol_decisions <- function(min_year = NULL) {
  base_url <- "https://www.rba.gov.au"

  # Turn scraped `href` values into absolute URLs. Missing and empty hrefs are
  # discarded first, and the empty case is returned explicitly because
  # `paste0(base_url, character(0))` would yield the bare base URL.
  to_absolute <- function(hrefs) {
    hrefs <- hrefs[!is.na(hrefs) & nzchar(hrefs)]
    if (length(hrefs) == 0) {
      return(character(0))
    }
    paste0(base_url, hrefs)
  }

  # Collect the media release URLs listed on one yearly index page.
  get_page_links <- function(url) {
    read_html(url) |>
      html_elements(".list-articles a") |>
      html_attr("href") |>
      to_absolute()
  }

  # Parse the release date, trying the `time` element first and then the
  # article-info box. Always returns a length-one Date (`NA` if none parses).
  get_release_date <- function(page) {
    date <- page |>
      html_elements("time") |>
      html_text() |>
      lubridate::dmy()
    date <- date[!is.na(date)]

    if (length(date) == 0) {
      date <- page |>
        html_elements("#content > section > div > div.box-article-info.article-data > div:nth-child(2) > span.value") |>
        html_text() |>
        lubridate::dmy()
      date <- date[!is.na(date)]
    }

    # Indexing a zero-length Date with [1] yields NA, which keeps this scalar.
    date[1]
  }

  # Download one media release and return it as a tibble.
  get_text_from_mr <- function(url) {
    page <- read_html(url)

    raw_text <- page |>
      html_elements("div.rss-mr-content") |>
      html_text2()

    if (length(raw_text) == 0) {
      # Fallback selector for pages without the `rss-mr-content` container.
      raw_text <- page |>
        html_elements(".article-data+ div") |>
        html_text2()
    }

    if (length(raw_text) == 0) {
      # Keep the release as a row instead of letting tibble() recycle the
      # other columns down to zero rows.
      raw_text <- NA_character_
    }

    date <- get_release_date(page)

    # `[1]` gives NA when no title was found and guards against duplicates.
    title <- page |>
      html_elements("span.rss-mr-title") |>
      html_text()
    title <- title[1]

    # The title layout changes at 2006-11-08: later titles are split on the
    # first comma, earlier ones on the first colon. A release whose date
    # could not be parsed is handled with the later rule.
    if (is.na(date) || date >= as.Date("2006-11-08")) {
      statement_by <- gsub(",.*", "", title)
      author <- gsub("Statement [Bb]y\\s*", "", statement_by)
    } else {
      statement_by <- gsub(":.*", "", title)
      author <- gsub("Statement by the Governor, Mr ", "", statement_by)
    }
    author <- trimws(author)

    # str_squish() collapses every run of whitespace (including \r and \n)
    # into a single space and trims both ends.
    text <- str_squish(raw_text)

    tibble(
      date = date,
      author = author,
      text = text
    )
  }

  monpol_year_urls <- read_html(paste0(base_url, "/monetary-policy/")) |>
    html_elements("li:nth-child(5) li li a") |>
    html_attr("href") |>
    to_absolute() |>
    sort(decreasing = TRUE)

  if (length(monpol_year_urls) == 0) {
    stop(
      "No yearly decision pages were found on the RBA monetary policy page; ",
      "the site layout may have changed.",
      call. = FALSE
    )
  }

  if (!is.null(min_year)) {
    years <- gsub(
      "https://www.rba.gov.au/monetary-policy/int-rate-decisions/|/",
      "",
      monpol_year_urls
    ) |>
      as.numeric()

    # Drop URLs whose year could not be parsed rather than indexing with NA,
    # which would insert NA entries into the URL vector.
    monpol_year_urls <- monpol_year_urls[!is.na(years) & years >= min_year]
  }

  page_links <- map(monpol_year_urls, get_page_links) |>
    unlist()

  map(page_links, get_text_from_mr) |>
    dplyr::bind_rows()
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment