@giocomai · Last active June 28, 2017 13:45
Castarter template for downloading a website, extracting metadata and exporting a dataset in R
## Install castarter (devtools is required to install from GitHub)
# install.packages("devtools")
devtools::install_github("giocomai/castarter")
## Load castarter
library("castarter")
## Set project and website name
SetCastarter(project = "EuropeanUnion", website = "EuropeanParliament")
## Create folder structure
CreateFolders()
## Preliminarily save the environment
SaveWebsite(saveEnvironment = TRUE, dataset = FALSE)
## Remember to also save this R script, ideally with the same file name as the .RData file just saved
## Create indexLinks
indexLinks <- CreateLinks(
    linkFirstChunk = "http://www.europarl.europa.eu/news/en/news-room/press-release?start=",
    startPage = 0,
    endPage = 5130,
    increaseBy = 10)
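## Quick sanity check in plain base R: the links generated above should run
## from ?start=0 to ?start=5130 in steps of 10, i.e. 514 index pages in total
## (assuming CreateLinks produces one link per step)
head(indexLinks)
length(indexLinks)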
## Download index pages; the second call (missingArticles = FALSE) re-downloads files of oddly small size
DownloadContents(links = indexLinks, type = "index")
DownloadContents(links = indexLinks, type = "index", missingArticles = FALSE)
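## Import the downloaded index pages into R, in order to extract article links from them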
indexHtml <- ImportHtml(from = "index")
## Extract articlesLinks
articlesLinks <- ExtractLinks(domain = "http://www.europarl.europa.eu/",
                              partOfLink = "/news/en/news-room/",
                              html = indexHtml)
head(articlesLinks)
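## Optionally check how many article links have been extracted (base R)
length(articlesLinks)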
## Download articles; as above, the second call re-downloads files of oddly small size
DownloadContents(links = articlesLinks, type = "articles")
DownloadContents(links = articlesLinks, type = "articles", missingArticles = FALSE)
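## Import the downloaded article pages into R for metadata and text extraction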
articlesHtml <- ImportHtml(from = "articles")
## Check a random article online
# browseURL(url = sample(x = articlesLinks, size = 1))
## Extract titles
titles <- ExtractTitles(articlesHtml = articlesHtml,
                        links = articlesLinks,
                        method = "htmlTitle")
head(titles)
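## Optional base R check: count titles that are missing or empty
sum(is.na(titles) | titles == "")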
## Extract dates
dates <- ExtractDates(articlesHtml = articlesHtml,
                      dateFormat = "dmY")
## Check how many dates have not been retrieved
sum(is.na(dates))
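## If some dates are missing, inspect the corresponding links (base R; this
## assumes dates and articlesLinks are aligned by position, as in the calls above)
head(articlesLinks[is.na(dates)])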
## Extract article IDs
articlesId <- ExtractId()
## Set language
language <- "english"
## Export metadata
metadata <- ExportMetadata(dates = dates,
                           id = articlesId,
                           titles = titles,
                           language = language,
                           links = articlesLinks,
                           exportXlsx = TRUE)
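## Inspect the structure of the exported metadata (base R; the exact columns
## depend on the castarter version installed)
str(metadata)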
## Extract text
contents <- ExtractTxt(articlesHtml = articlesHtml, metadata = metadata)
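## Spot-check a random article: pick an index and compare its title, date and text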
i <- sample(x = seq_along(contents), size = 1)
titles[i]
dates[i]
contents[i]
# browseURL(url = articlesLinks[i])
## Save the environment and export datasets to the dedicated folder
SaveWebsite(saveEnvironment = TRUE, dataset = TRUE)
## Archive folders
ArchiveFolders(removeArchivedFolders = TRUE)
#### End of website download ####