Extract all press releases, speeches, and statements issued by the European Commission with R and the castarter package.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Install castarter (devtools is required for installing from GitHub)
# install.packages("devtools")
devtools::install_github(repo = "giocomai/castarter", ref = "development")

## Work from a dedicated folder
## NOTE(review): prefer an RStudio project / here::here() over setwd() in scripts
setwd("~/R")

## Load castarter
library("castarter")

## Set project and website name (castarter uses these to build folder paths)
SetCastarter(project = "EuropeanUnion", website = "EuropeanCommission")

## Create the project/website folder structure on disk
CreateFolders()

## Preliminarily save the environment (no dataset has been built yet)
SaveWebsite(saveEnvironment = TRUE, dataset = FALSE)
## Remember to save also the R file, ideally with the same file name
## as the .RData that has just been saved
## Build the URLs of the search-result (index) pages to download:
## pages 1..5877 of the EU "rapid" press-release search, English locale,
## restricted to press releases (IP), statements, speeches and announcements (AC)
indexLinks <- CreateLinks(
  linkFirstChunk = "http://europa.eu/rapid/search-result.htm?page=",
  linkSecondChunk = "&format=HTML&type=IP&type=STATEMENT&type=SPEECH&type=AC&size=10&locale=EN",
  startPage = 1,
  endPage = 5877,
  increaseBy = 1)

## Download index pages; the second call re-downloads files of oddly small size
## (likely truncated/failed downloads)
DownloadContents(links = indexLinks, type = "index")
DownloadContents(links = indexLinks, type = "index", missingArticles = FALSE)
## Extract links to individual press releases from the downloaded index pages
articlesLinks <- ExtractLinks(domain = "http://europa.eu",
                              partOfLink = "rapid/press-release_")
## Keep only English-language pages (links ending in "_en.htm")
articlesLinks <- articlesLinks[grepl(pattern = "_en.htm", x = articlesLinks)]

## Download articles; the second call re-downloads files smaller than
## 15000 bytes, which are presumed to be incomplete downloads
DownloadContents(links = articlesLinks, type = "articles", createScript = FALSE)
DownloadContents(links = articlesLinks, type = "articles", missingArticles = FALSE, size = 15000)

## Import the downloaded HTML files into memory
articlesHtml <- ImportHtml(from = "articles")

## Spot-check a random article online
# browseURL(url = sample(x = articlesLinks, size = 1))
## Extract titles from the HTML <title> tag, stripping the site's boilerplate prefix
titles <- ExtractTitles(articlesHtml = articlesHtml,
                        links = articlesLinks,
                        method = "htmlTitle",
                        removeString = "European Commission - PRESS RELEASES - Press release - ")
# head(titles)
# project <- "EuropeanUnion"
# website <- "EuropeanCommission"
library(rvest)
library(purrr)
# htmlList <- list.files(path = file.path(project, website, "Html"), full.names = TRUE)
# htmlList <- htmlList[stringr::str_extract(string = htmlList, pattern = "[[:digit:]]+[[:punct:]]html") %>% stringr::str_sub(start = 1L, end = -6L) %>% as.integer() %>% order()]

## Pull the raw date string from each article's <meta name="Date"> tag.
## progress_estimated() is not exported by rvest or purrr, so it must be
## namespaced; it lives in dplyr (deprecated in recent dplyr versions —
## consider purrr's .progress argument instead).
pb <- dplyr::progress_estimated(n = length(articlesHtml))
datesTxt <- purrr::map_chr(.x = articlesHtml, .f = function(x, .pb = NULL) {
  # Tick the progress bar if one was supplied and it is not yet complete
  if ((!is.null(.pb)) && inherits(.pb, "Progress") && (.pb$i < .pb$n)) .pb$tick()$print()
  # NOTE(review): map_chr() errors if a page has zero or multiple
  # <meta name="Date"> tags — assumes exactly one per article; verify.
  xml2::read_html(x) %>%
    rvest::html_nodes("meta[name=Date]") %>%
    rvest::html_attr('content')
}, .pb = pb)
## Parse the raw date strings (day-month-year format) into Date objects
dates <- ExtractDates(articlesHtml = datesTxt,
                      dateFormat = "dmY")
## Check how many dates could not be parsed
sum(is.na(dates))

## Extract a unique ID for each article
articlesId <- ExtractId()

## Set the dataset language
language <- "english"

## Assemble and export the metadata table (also written to an .xlsx file)
metadata <- ExportMetadata(dates = dates,
                           id = articlesId,
                           titles = titles,
                           language = language,
                           links = articlesLinks,
                           exportXlsx = TRUE)
## Extract article body text from the div with id "contentPressRelease",
## dropping everything after the trailing "Contacts :" section
contents <- ExtractTxt(articlesHtml = articlesHtml,
                       metadata = metadata,
                       divId = "contentPressRelease",
                       removeEverythingAfter = "Contacts :")

## Spot-check one random article: title, date and text should be consistent
i <- sample(x = seq_along(articlesLinks), size = 1)
titles[i]
dates[i]
contents[i]
# browseURL(url = articlesLinks[i])

## Save environment and export datasets to the dedicated folder
SaveWebsite(saveEnvironment = TRUE, dataset = TRUE)

## Compress and archive the download folders to save disk space
ArchiveFolders(removeArchivedFolders = TRUE)
# RestoreArchives(html = TRUE)
#### End of website download ###
## Export dataset (standalone section: can be run in a fresh session)
project <- "EuropeanUnion"
website <- "EuropeanCommission"
## Convert the date column with base R rather than dplyr::mutate(), since
## neither dplyr nor the magrittr pipe is attached anywhere in this script
dataset <- LoadDatasets()
dataset$date <- as.Date(dataset$date)
saveRDS(object = dataset, file = "dataset.rds")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment