Extract all press releases, speeches and statements issued by the European Commission with R and castarter
## Install castarter (devtools required for installing from GitHub)
# install.packages("devtools")
devtools::install_github(repo = "giocomai/castarter", ref = "development")
setwd("~/R")
## Load castarter
library("castarter")
## Set project and website name
SetCastarter(project = "EuropeanUnion", website = "EuropeanCommission")
## Create folder structure
CreateFolders()
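## CreateFolders() sets up the folder tree for this project/website
## (e.g. EuropeanUnion/EuropeanCommission/Html, where downloaded pages are stored and read from below)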
## Preliminarily save the environment
SaveWebsite(saveEnvironment = TRUE, dataset = FALSE)
## Remember to also save this R script, ideally with the same file name as the .RData file that has just been saved
## Create indexLinks
indexLinks <- CreateLinks(
  linkFirstChunk = "http://europa.eu/rapid/search-result.htm?page=",
  linkSecondChunk = "&format=HTML&type=IP&type=STATEMENT&type=SPEECH&type=AC&size=10&locale=EN",
  startPage = 1,
  endPage = 5877,
  increaseBy = 1)
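## Each index link should simply be linkFirstChunk + page number + linkSecondChunk, e.g. for page 1:
## "http://europa.eu/rapid/search-result.htm?page=1&format=HTML&type=IP&type=STATEMENT&type=SPEECH&type=AC&size=10&locale=EN"
# head(indexLinks, 3)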
## Download index pages, then re-download any files of oddly small size
DownloadContents(links = indexLinks, type = "index")
DownloadContents(links = indexLinks, type = "index", missingArticles = FALSE)
## Extract links to individual articles
articlesLinks <- ExtractLinks(domain = "http://europa.eu",
                              partOfLink = "rapid/press-release_")
## Keep only English-language pages
articlesLinks <- articlesLinks[grepl(pattern = "_en.htm", x = articlesLinks)]
## Download articles, then re-download any files below the given size threshold (likely incomplete downloads)
DownloadContents(links = articlesLinks, type = "articles", createScript = FALSE)
DownloadContents(links = articlesLinks, type = "articles", missingArticles = FALSE, size = 15000)
## Import the downloaded HTML files
articlesHtml <- ImportHtml(from = "articles")
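## Optional check (not part of the original workflow): the number of imported pages should match the number of article links
# length(articlesHtml) == length(articlesLinks)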
## Check a random article online
# browseURL(url = sample(x = articlesLinks, size = 1))
## Extract titles
titles <- ExtractTitles(articlesHtml = articlesHtml,
                        links = articlesLinks,
                        method = "htmlTitle",
                        removeString = "European Commission - PRESS RELEASES - Press release - ")
# head(titles)
# project <- "EuropeanUnion"
# website <- "EuropeanCommission"
library(rvest)
library(purrr)
library(dplyr) # provides progress_estimated(), used below
## Alternative (commented out): list and order the downloaded HTML files directly from disk
# htmlList <- list.files(path = file.path(project, website, "Html"), full.names = TRUE)
# htmlList <- htmlList[stringr::str_extract(string = htmlList, pattern = "[[:digit:]]+[[:punct:]]html") %>% stringr::str_sub(start = 1L, end = -6L) %>% as.integer() %>% order()]
pb <- progress_estimated(n = length(articlesHtml))
## Extract the publication date of each article from its <meta name="Date"> tag
datesTxt <- purrr::map_chr(.x = articlesHtml, .f = function(x, .pb = NULL) {
  if ((!is.null(.pb)) && inherits(.pb, "Progress") && (.pb$i < .pb$n)) .pb$tick()$print()
  xml2::read_html(x) %>%
    rvest::html_node("meta[name=Date]") %>% # html_node() (singular) returns NA for pages without the tag
    rvest::html_attr("content")
}, .pb = pb)
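## Quick sanity check of the raw date strings before parsing (an optional check, not in the original workflow)
# head(datesTxt)
# sum(is.na(datesTxt))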
## Extract dates
dates <- ExtractDates(articlesHtml = datesTxt,
                      dateFormat = "dmY")
## Check how many dates have not been retrieved
sum(is.na(dates))
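## If some dates are missing, inspect the corresponding articles (assuming dates and articlesLinks share the same order, as elsewhere in this script)
# articlesLinks[is.na(dates)]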
## Extract ID
articlesId <- ExtractId()
## Set language
language <- "english"
## Export metadata
metadata <- ExportMetadata(dates = dates,
                           id = articlesId,
                           titles = titles,
                           language = language,
                           links = articlesLinks,
                           exportXlsx = TRUE)
## Extract text
contents <- ExtractTxt(articlesHtml = articlesHtml,
                       metadata = metadata,
                       divId = "contentPressRelease",
                       removeEverythingAfter = "Contacts :")
## Spot-check a random article
i <- sample(x = 1:length(articlesLinks), 1)
titles[i]
dates[i]
contents[i]
# browseURL(url = articlesLinks[i])
## Save the environment and export the dataset to the dedicated folder
SaveWebsite(saveEnvironment = TRUE, dataset = TRUE)
## Archive folders
ArchiveFolders(removeArchivedFolders = TRUE)
# RestoreArchives(html = TRUE)
#### End of website download ####
## Export the dataset as a single .rds file
project <- "EuropeanUnion"
website <- "EuropeanCommission"
saveRDS(object = LoadDatasets() %>% mutate(date = as.Date(date)), file = "dataset.rds")
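## A minimal follow-up sketch (not part of the original workflow): reload the exported dataset
## and count documents per year, relying only on the `date` column set above
dataset <- readRDS(file = "dataset.rds")
dataset %>%
  mutate(year = format(date, "%Y")) %>%
  count(year)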