Skip to content

Instantly share code, notes, and snippets.

@briatte
Created July 24, 2014 12:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save briatte/507140640814cb0b7e31 to your computer and use it in GitHub Desktop.
Save briatte/507140640814cb0b7e31 to your computer and use it in GitHub Desktop.
download all asset declarations from French MPs, July 2014
# parse XPath syntax from well-formed HTML
library(XML)
# Output folder for the PDF files; the complete archive will take ~ 1.4 GB on
# disk. showWarnings = FALSE keeps re-runs quiet when the folder exists.
dir.create("declarations", showWarnings = FALSE)

# Parse the search page that lists the MPs (941 found on 2014-07-24, at
# website launch) and extract the link to each MP page.
h <- htmlParse("http://www.hatvp.fr/consulter-les-declarations-rechercher.html")
h <- xpathSApply(h, "//div[@id='annuaire']/*/*/*/a/@href")

# paste0() recycles a zero-length argument to "", so an empty result would
# silently become the bare site root below; stop with a clear message instead.
if (length(h) == 0) {
  stop("no MP pages found in the directory listing", call. = FALSE)
}

# Prefix the extracted links with the site root to get the page URLs.
h <- paste0("http://www.hatvp.fr/", h)
# Download the declaration PDF(s) linked from every MP page listed in `h`.
#
# The target file name is derived from the page URL
# (declarations/<name>.pdf). MPs that already have a file on disk are
# skipped, so the loop can be re-run after an interruption.
site <- "http://www.hatvp.fr"

for (n in seq_along(h)) {
  i <- h[n]

  # Progress: number of pages left after this one, then the page URL.
  cat(sprintf("%0.3g", length(h) - n), i)

  # Target path for a single declaration, and the same path without the
  # extension (used to build numbered file names).
  j <- gsub("http://www.hatvp.fr/pages_nominatives", "declarations", i,
            fixed = TRUE)
  j <- gsub("html$", "pdf", j)
  stem <- sub("\\.pdf$", "", j)

  # An MP is done when either the single file or the first numbered file
  # exists. Exact paths are tested so that one MP name being contained in
  # another cannot produce a false "skipped".
  if (file.exists(j) || file.exists(paste0(stem, "-1.pdf"))) {
    cat(" [ skipped ]\n")
    next
  }

  k <- xpathSApply(htmlParse(i), "//a[contains(@href, '.pdf')]/@href")

  # The links are presumably relative ("../..."): replace the first ".."
  # with the site root. A NULL or empty result becomes character(0).
  k <- sub("..", site, k, fixed = TRUE)

  # No, single or multiple declarations available. Testing the length
  # covers both a NULL and a zero-length result.
  if (length(k) == 0) {
    cat(" [ no file ]\n")
  } else if (length(k) == 1) {
    # mode = "wb": PDFs are binary and get corrupted by a text-mode
    # transfer on Windows.
    download.file(k, j, mode = "wb", quiet = TRUE)
    cat(" [ downloaded", file.info(j)$size / 1000, "KB ]\n")
  } else {
    # Several files: save them as <name>-1.pdf, <name>-2.pdf, ...
    for (s in seq_along(k)) {
      download.file(k[s], paste0(stem, "-", s, ".pdf"),
                    mode = "wb", quiet = TRUE)
    }
    cat(" [ downloaded", length(k), "files ]\n")
  }
}
# List every downloaded PDF, print a one-line summary and save a manifest.
f <- dir("declarations", "pdf$", full.names = TRUE)

# A few MPs have an initial declaration and a correction for it, saved with
# a numeric suffix (<name>-1.pdf, <name>-2.pdf, ...): strip the suffix and
# the extension so that each MP is counted once, whatever the number of
# digits in the suffix.
mps <- unique(sub("(-\\d+)?\\.pdf$", "", f))

# Query the file system once and reuse the result for both outputs.
info <- file.info(f)

cat(length(mps), "MPs", length(f), "files",
    round(sum(info$size) / 10^9, 1), "GB\n")

# Save a plain text file manifest with file sizes and ctime. Note that ctime
# is the file creation time on Windows only; on Unix-alikes it is the time of
# the last status change.
write.table(info[, c("size", "ctime")],
            "declarations/manifest.txt", quote = FALSE)
# have a nice day
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment