## Extract all press releases, speeches and statements issued by the European Commission with R and castarter
## Install castarter (devtools is required to install from GitHub)
# install.packages("devtools")
devtools::install_github(repo = "giocomai/castarter", ref = "development")
setwd("~/R")
## Load castarter
library("castarter")
## Set project and website name
SetCastarter(project = "EuropeanUnion", website = "EuropeanCommission")
## Create folder structure
CreateFolders()
## Preliminarily save environment
SaveWebsite(saveEnvironment = TRUE, dataset = FALSE)
## Remember also to save this R script, ideally under the same file name as the .RData file that has just been saved
## Create indexLinks
indexLinks <- CreateLinks(
  linkFirstChunk = "http://europa.eu/rapid/search-result.htm?page=",
  linkSecondChunk = "&format=HTML&type=IP&type=STATEMENT&type=SPEECH&type=AC&size=10&locale=EN",
  startPage = 1,
  endPage = 5877,
  increaseBy = 1)
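## Optional sanity check (not in the original script): each generated link should be the two
## chunks joined around a page number (e.g. ".../search-result.htm?page=1&format=HTML...&locale=EN"),
## one per page up to endPage
# head(indexLinks, n = 2)
# length(indexLinks)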
## Download index pages, then re-download any files of oddly small size (likely incomplete downloads)
DownloadContents(links = indexLinks, type = "index")
DownloadContents(links = indexLinks, type = "index", missingArticles = FALSE)
## Extract articlesLinks
articlesLinks <- ExtractLinks(domain = "http://europa.eu",
                              partOfLink = "rapid/press-release_")
articlesLinks <- articlesLinks[grepl(pattern = "_en.htm", x = articlesLinks)]
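## Optional check (not in the original script): only links to English-language pages ("_en.htm")
## should remain after the filter above
# length(articlesLinks)
# head(articlesLinks, n = 3)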
## Download articles
DownloadContents(links = articlesLinks, type = "articles", createScript = FALSE)
DownloadContents(links = articlesLinks, type = "articles", missingArticles = FALSE, size = 15000)
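## Optional check (not in the original script): the number of downloaded HTML files should roughly
## match the number of links; the "EuropeanUnion/EuropeanCommission/Html" folder path is an
## assumption, taken from the commented-out htmlList lines further below
# length(list.files(path = file.path("EuropeanUnion", "EuropeanCommission", "Html")))
# length(articlesLinks)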
## Import the downloaded HTML files
articlesHtml <- ImportHtml(from = "articles")
## Check a random article online
# browseURL(url = sample(x = articlesLinks, size = 1))
## Extract titles
titles <- ExtractTitles(articlesHtml = articlesHtml,
                        links = articlesLinks,
                        method = "htmlTitle",
                        removeString = "European Commission - PRESS RELEASES - Press release - ")
# head(titles)
# project <- "EuropeanUnion"
# website <- "EuropeanCommission"
library(rvest)
library(purrr)
# htmlList <- list.files(path = file.path(project, website, "Html"), full.names = TRUE)
# htmlList <- htmlList[stringr::str_extract(string = htmlList, pattern = "[[:digit:]]+[[:punct:]]html") %>% stringr::str_sub(start = 1L, end = -6L) %>% as.integer() %>% order()]
## Read the raw date string from each page's <meta name="Date"> tag (with a progress bar);
## progress_estimated() comes from dplyr
pb <- dplyr::progress_estimated(n = length(articlesHtml))
datesTxt <- purrr::map_chr(.x = articlesHtml, .f = function(x, .pb = NULL) {
  if ((!is.null(.pb)) && inherits(.pb, "Progress") && (.pb$i < .pb$n)) .pb$tick()$print()
  xml2::read_html(x) %>%
    rvest::html_nodes("meta[name=Date]") %>%
    rvest::html_attr("content")
}, .pb = pb)
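## Optional check (not in the original script): datesTxt should now hold one raw date string per
## article; the "dmY" (day-month-year) format passed to ExtractDates() below is assumed to match
## these strings
# head(datesTxt)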
## Extract dates
dates <- ExtractDates(articlesHtml = datesTxt,
                      dateFormat = "dmY")
## Check how many dates have not been retrieved
sum(is.na(dates))
## Extract ID
articlesId <- ExtractId()
## Set language
language <- "english"
## Export metadata
metadata <- ExportMetadata(dates = dates,
                           id = articlesId,
                           titles = titles,
                           language = language,
                           links = articlesLinks,
                           exportXlsx = TRUE)
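## Optional check (not in the original script): metadata should have one row per article,
## combining dates, IDs, titles, language and links
# head(metadata)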
## Extract text
contents <- ExtractTxt(articlesHtml = articlesHtml,
                       metadata = metadata,
                       divId = "contentPressRelease",
                       removeEverythingAfter = "Contacts :")
## Spot-check a random article
i <- sample(x = seq_along(articlesLinks), size = 1)
titles[i]
dates[i]
contents[i]
# browseURL(url = articlesLinks[i])
## Save environment and export datasets to the dedicated folder
SaveWebsite(saveEnvironment = TRUE, dataset = TRUE)
## Archive folders
ArchiveFolders(removeArchivedFolders = TRUE)
#RestoreArchives(html = TRUE)
#### End of website download ####
## Export the full dataset as a single .rds file (dplyr is needed for mutate and the pipe)
library("dplyr")
project <- "EuropeanUnion"
website <- "EuropeanCommission"
saveRDS(object = LoadDatasets() %>% mutate(date = as.Date(date)), file = "dataset.rds")
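## The exported dataset can later be loaded in a fresh session with readRDS()
# dataset <- readRDS(file = "dataset.rds")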