@giocomai · Last active June 28, 2017 13:45
Castarter template for downloading a website, extracting metadata and exporting a dataset in R
## Install castarter (devtools is required to install from GitHub)
# install.packages("devtools")
devtools::install_github("giocomai/castarter")
## Load castarter
library("castarter")
## Set project and website name
SetCastarter(project = "EuropeanUnion", website = "EuropeanParliament")
## Create folder structure
CreateFolders()
## Preliminarily save the environment
SaveWebsite(saveEnvironment = TRUE, dataset = FALSE)
## Remember to also save this R script, ideally with the same file name as the .RData file just saved
## Create indexLinks
indexLinks <- CreateLinks(
    linkFirstChunk = "http://www.europarl.europa.eu/news/en/news-room/press-release?start=",
    startPage = 0,
    endPage = 5130,
    increaseBy = 10)
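## Quick sanity check in plain base R: the links generated above should run
## from ?start=0 to ?start=5130 in steps of 10, i.e. 514 index pages in total
## (assuming CreateLinks produces one link per step)
head(indexLinks)
length(indexLinks)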
## Download index pages; the second call (missingArticles = FALSE) re-downloads files of oddly small size
DownloadContents(links = indexLinks, type = "index")
DownloadContents(links = indexLinks, type = "index", missingArticles = FALSE)
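## Import the downloaded index pages into R, in order to extract article links from them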
indexHtml <- ImportHtml(from = "index")
## Extract articlesLinks
articlesLinks <- ExtractLinks(domain = "http://www.europarl.europa.eu/",
                              partOfLink = "/news/en/news-room/",
                              html = indexHtml)
head(articlesLinks)
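## Optionally check how many article links have been extracted (base R)
length(articlesLinks)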
## Download articles; as above, the second call re-downloads files of oddly small size
DownloadContents(links = articlesLinks, type = "articles")
DownloadContents(links = articlesLinks, type = "articles", missingArticles = FALSE)
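## Import the downloaded article pages into R for metadata and text extraction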
articlesHtml <- ImportHtml(from = "articles")
## Check a random article online
# browseURL(url = sample(x = articlesLinks, size = 1))
## Extract titles
titles <- ExtractTitles(articlesHtml = articlesHtml,
                        links = articlesLinks,
                        method = "htmlTitle")
head(titles)
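## Optional base R check: count titles that are missing or empty
sum(is.na(titles) | titles == "")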
## Extract dates
dates <- ExtractDates(articlesHtml = articlesHtml,
                      dateFormat = "dmY")
## Check how many dates have not been retrieved
sum(is.na(dates))
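## If some dates are missing, inspect the corresponding links (base R; this
## assumes dates and articlesLinks are aligned by position, as in the calls above)
head(articlesLinks[is.na(dates)])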
## Extract article IDs
articlesId <- ExtractId()
## Set language
language <- "english"
## Export metadata
metadata <- ExportMetadata(dates = dates,
                           id = articlesId,
                           titles = titles,
                           language = language,
                           links = articlesLinks,
                           exportXlsx = TRUE)
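## Inspect the structure of the exported metadata (base R; the exact columns
## depend on the castarter version installed)
str(metadata)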
## Extract text
contents <- ExtractTxt(articlesHtml = articlesHtml, metadata = metadata)
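## Spot-check a random article: pick an index and compare its title, date and text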
i <- sample(x = seq_along(contents), size = 1)
titles[i]
dates[i]
contents[i]
# browseURL(url = articlesLinks[i])
## Save the environment and export datasets to the dedicated folder
SaveWebsite(saveEnvironment = TRUE, dataset = TRUE)
## Archive folders
ArchiveFolders(removeArchivedFolders = TRUE)
#### End of website download ####