Extract all press releases, speeches and statements issued by the European Commission with R and castarter
## Install castarter (devtools required for installing from GitHub)
# install.packages("devtools")
devtools::install_github(repo = "giocomai/castarter", ref = "development")
setwd("~/R")
## Load castarter
library("castarter")
## Set project and website name
SetCastarter(project = "EuropeanUnion", website = "EuropeanCommission")
## Create folder structure
CreateFolders()
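## CreateFolders() sets up the folder tree for this project/website
## (e.g. EuropeanUnion/EuropeanCommission/Html, where downloaded pages are stored and read from below)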
## Preliminarily save the environment
SaveWebsite(saveEnvironment = TRUE, dataset = FALSE)
## Remember to also save this R script, ideally with the same file name as the .RData file that has just been saved
## Create indexLinks
indexLinks <- CreateLinks(
  linkFirstChunk = "http://europa.eu/rapid/search-result.htm?page=",
  linkSecondChunk = "&format=HTML&type=IP&type=STATEMENT&type=SPEECH&type=AC&size=10&locale=EN",
  startPage = 1,
  endPage = 5877,
  increaseBy = 1)
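## Each index link should simply be linkFirstChunk + page number + linkSecondChunk, e.g. for page 1:
## "http://europa.eu/rapid/search-result.htm?page=1&format=HTML&type=IP&type=STATEMENT&type=SPEECH&type=AC&size=10&locale=EN"
# head(indexLinks, 3)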
## Download index pages, then re-download any files of oddly small size
DownloadContents(links = indexLinks, type = "index")
DownloadContents(links = indexLinks, type = "index", missingArticles = FALSE)
## Extract links to individual articles
articlesLinks <- ExtractLinks(domain = "http://europa.eu",
                              partOfLink = "rapid/press-release_")
## Keep only English-language pages
articlesLinks <- articlesLinks[grepl(pattern = "_en.htm", x = articlesLinks)]
## Download articles, then re-download any files below the given size threshold (likely incomplete downloads)
DownloadContents(links = articlesLinks, type = "articles", createScript = FALSE)
DownloadContents(links = articlesLinks, type = "articles", missingArticles = FALSE, size = 15000)
## Import the downloaded HTML files
articlesHtml <- ImportHtml(from = "articles")
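## Optional check (not part of the original workflow): the number of imported pages should match the number of article links
# length(articlesHtml) == length(articlesLinks)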
## Check a random article online
# browseURL(url = sample(x = articlesLinks, size = 1))
## Extract titles
titles <- ExtractTitles(articlesHtml = articlesHtml,
                        links = articlesLinks,
                        method = "htmlTitle",
                        removeString = "European Commission - PRESS RELEASES - Press release - ")
# head(titles)
# project <- "EuropeanUnion"
# website <- "EuropeanCommission"
library(rvest)
library(purrr)
library(dplyr) # provides progress_estimated(), used below
## Alternative (commented out): list and order the downloaded HTML files directly from disk
# htmlList <- list.files(path = file.path(project, website, "Html"), full.names = TRUE)
# htmlList <- htmlList[stringr::str_extract(string = htmlList, pattern = "[[:digit:]]+[[:punct:]]html") %>% stringr::str_sub(start = 1L, end = -6L) %>% as.integer() %>% order()]
pb <- progress_estimated(n = length(articlesHtml))
## Extract the publication date of each article from its <meta name="Date"> tag
datesTxt <- purrr::map_chr(.x = articlesHtml, .f = function(x, .pb = NULL) {
  if ((!is.null(.pb)) && inherits(.pb, "Progress") && (.pb$i < .pb$n)) .pb$tick()$print()
  xml2::read_html(x) %>%
    rvest::html_node("meta[name=Date]") %>% # html_node() (singular) returns NA for pages without the tag
    rvest::html_attr("content")
}, .pb = pb)
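## Quick sanity check of the raw date strings before parsing (an optional check, not in the original workflow)
# head(datesTxt)
# sum(is.na(datesTxt))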
## Extract dates
dates <- ExtractDates(articlesHtml = datesTxt,
                      dateFormat = "dmY")
## Check how many dates have not been retrieved
sum(is.na(dates))
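## If some dates are missing, inspect the corresponding articles (assuming dates and articlesLinks share the same order, as elsewhere in this script)
# articlesLinks[is.na(dates)]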
## Extract ID
articlesId <- ExtractId()
## Set language
language <- "english"
## Export metadata
metadata <- ExportMetadata(dates = dates,
                           id = articlesId,
                           titles = titles,
                           language = language,
                           links = articlesLinks,
                           exportXlsx = TRUE)
## Extract text
contents <- ExtractTxt(articlesHtml = articlesHtml,
                       metadata = metadata,
                       divId = "contentPressRelease",
                       removeEverythingAfter = "Contacts :")
## Spot-check a random article
i <- sample(x = 1:length(articlesLinks), 1)
titles[i]
dates[i]
contents[i]
# browseURL(url = articlesLinks[i])
## Save the environment and export the dataset to the dedicated folder
SaveWebsite(saveEnvironment = TRUE, dataset = TRUE)
## Archive folders
ArchiveFolders(removeArchivedFolders = TRUE)
# RestoreArchives(html = TRUE)
#### End of website download ####
## Export the dataset as a single .rds file
project <- "EuropeanUnion"
website <- "EuropeanCommission"
saveRDS(object = LoadDatasets() %>% mutate(date = as.Date(date)), file = "dataset.rds")
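## A minimal follow-up sketch (not part of the original workflow): reload the exported dataset
## and count documents per year, relying only on the `date` column set above
dataset <- readRDS(file = "dataset.rds")
dataset %>%
  mutate(year = format(date, "%Y")) %>%
  count(year)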