Skip to content

Instantly share code, notes, and snippets.

@briatte
Last active October 7, 2016 16:11
Show Gist options
  • Save briatte/df0f41f5ce97443e0a0a to your computer and use it in GitHub Desktop.
Save briatte/df0f41f5ce97443e0a0a to your computer and use it in GitHub Desktop.
100-line scraper for plenary statements by Members of the European Parliament — see briatte/euspeech for the full project
library(XML)
library(jsonlite)
library(plyr)
# Directory that receives the "records/<id>_cre.csv" files written further
# down. showWarnings = FALSE keeps re-runs quiet when it already exists.
dir.create("records", showWarnings = FALSE)

# `data` starts as the path of the cached MEP index and is replaced by the
# parsed index (a data frame) by the read.csv() call at the end of this block.
data <- "meps.csv"

# Scrape the directory page only when no cached index is on disk.
if (!file.exists(data)) {
  html <- "http://www.europarl.europa.eu/meps/en/directory.html?filter=all&leg="
  html <- htmlParse(html, encoding = "UTF-8")

  # index page
  # root() prefixes an XPath fragment with the path to each MEP's <li> items.
  root <- function(x) paste0("//div[@class='zone_info_mep']/div[@class='mep_details']/ul/li", x)
  link <- xpathSApply(html, root("[@class='mep_name']/a/@href"))
  name <- xpathSApply(html, root("[@class='mep_name']"))
  name <- sapply(name, xmlValue)

  # # MEPs currently in power (different page, ongoing legislature only)
  # natl = xpathSApply(html, root("[contains(@class, 'nationality')]/@class"))
  # natl = gsub("nationality ", "", natl)
  # group = xpathSApply(html, root("[contains(@class, 'group')]/@class"))
  # group = gsub("group ", "", group)
  # party = xpathSApply(html, root("[contains(@class, 'nationality')]/span"))
  # party = sapply(party, xmlValue)
  # party = gsub("\\\"", "", party)
  # # add memberships from individual MEP pages (ongoing legislature only)
  # member = sapply(link, function(x) {
  # print(x)
  # html = htmlParse(paste0("http://www.europarl.europa.eu/", x))
  # root = "//ul[@class='events_collection']"
  # html = sapply(xpathSApply(html, paste0(root, "/*/acronym | ", root, "/*/*/acronym")), xmlValue)
  # return(paste0(html, collapse = ";"))
  # })

  # Cache one row per MEP: profile link and displayed name.
  write.csv(data.frame(link, name), data) # , natl, party, group, member
}

data <- read.csv(data, stringsAsFactors = FALSE)
#' Download the paginated `type=CRE` document list of one MEP.
#'
#' Requests `see_more.html?type=CRE` pages for the given MEP and legislature,
#' starting at index 0 and following the `nextIndex` field of each JSON reply
#' until it is missing, zero or negative.
#'
#' @param id MEP identifier as it appears in the europarl.europa.eu URL.
#' @param leg Legislature number sent as the `leg` query parameter; it is also
#'   prepended as a `leg` column to every returned row.
#' @param verbose If `TRUE`, print the id, legislature and each page index
#'   with `cat()`.
#' @param max_tries Number of consecutive failed requests tolerated for one
#'   page. Once reached, pagination stops and the rows gathered so far are
#'   returned, so a permanently failing URL cannot loop forever.
#' @return A data frame holding the rows of every `documentList` received, or
#'   an empty `data.frame()` when none was received.
get_cre <- function(id, leg = 7, verbose = TRUE, max_tries = 3) {
  if (verbose) {
    cat("\n", id, "legislature", leg)
  }

  pages <- list()
  idx <- 0
  failures <- 0

  repeat {
    if (verbose) {
      cat(" ", idx, "...")
    }

    x <- paste0("http://www.europarl.europa.eu/meps/en/", id,
                "/see_more.html?type=CRE&leg=", leg, "&index=", idx)
    x <- try(fromJSON(readLines(x, warn = FALSE), flatten = TRUE))

    if (inherits(x, "try-error")) {
      warning("Scraper error: MEP ", id)
      failures <- failures + 1
      if (failures >= max_tries) {
        break
      }
      # Retry the same page index.
      next
    }
    failures <- 0

    # Keep only non-empty tabular replies; cbind() cannot recycle `leg`
    # against a zero-row data frame.
    docs <- x$documentList
    if (is.data.frame(docs) && nrow(docs) > 0) {
      pages[[length(pages) + 1L]] <- cbind(leg, docs)
    }

    # A missing, NA, zero or negative nextIndex ends the pagination.
    idx <- x$nextIndex
    if (length(idx) != 1L || is.na(idx) || !idx || idx < 0) {
      break
    }
  }

  if (length(pages) == 0L) {
    return(data.frame())
  }
  do.call(rbind, pages)
}
# Collapse a vector into a single ";"-separated string.
# Zero-length input (including NULL) yields NA instead of an empty string.
sanitize <- function(x) {
  if (length(x) < 1) {
    return(NA)
  }
  paste0(x, collapse = ";")
}
# MEP ids: the run of digits that follows "/meps/en/" in each profile link.
files <- gsub("/meps/en/(\\d+)(.*)", "\\1", data$link)

for (i in files) {
  file <- paste0("records/", i, "_cre.csv")

  # MEPs whose CSV already exists are skipped.
  if (!file.exists(file)) {
    # One get_cre() result per legislature 1 to 7, stacked by rbind.fill().
    record <- lapply(1:7, function(y) get_cre(i, y))
    record <- rbind.fill(record)

    if (length(record) > 0) {
      record <- data.frame(id = i, record)

      # Collapse each list column to one string per row. A column that is
      # absent is skipped: sapply() over a missing column returns list(),
      # which cannot be assigned to a data frame that has rows.
      for (list_col in c("formatList", "committeeList", "voteExplanationList")) {
        if (list_col %in% names(record)) {
          record[[list_col]] <- sapply(record[[list_col]], sanitize)
        }
      }

      record <- lapply(record, unlist)
      write.csv(record, file)
    }

    # match() returns the first position only, so the count stays a single
    # number even if an id occurs more than once in `files`.
    message(paste("Scraped: MEP", i, length(files) - match(i, files), "left"))
  }
}
# kthxbye
# Extract the numeric MEP id from every "/meps/en/<id>..." profile link.
files <- gsub("/meps/en/(\\d+)(.*)", "\\1", data$link)

for (i in files) {
  file <- paste0("records/", i, "_cre.csv")

  # Skip MEPs for which a CSV has already been written.
  if (!file.exists(file)) {
    # Fetch legislatures 1 to 7 and stack the results with rbind.fill().
    record <- lapply(1:7, function(y) get_cre(i, y))
    record <- rbind.fill(record)

    if (length(record) > 0) {
      record <- data.frame(id = i, record)

      # Flatten the list columns with sanitize(), but only those actually
      # present: sapply() on a missing column yields list(), and assigning
      # that to a data frame with rows is an error.
      for (list_col in c("formatList", "committeeList", "voteExplanationList")) {
        if (list_col %in% names(record)) {
          record[[list_col]] <- sapply(record[[list_col]], sanitize)
        }
      }

      record <- lapply(record, unlist)
      write.csv(record, file)
    }

    # match() gives one position even when an id is duplicated, keeping the
    # progress message to a single line.
    message(paste("Scraped: MEP", i, length(files) - match(i, files), "left"))
  }
}
#' Download the paginated `type=CRE` document list of one MEP.
#'
#' Walks the `see_more.html?type=CRE` pages for the given MEP and legislature
#' from index 0, following the `nextIndex` field of each JSON reply until it
#' is missing, zero or negative.
#'
#' @param id MEP identifier as it appears in the europarl.europa.eu URL.
#' @param leg Legislature number sent as the `leg` query parameter; it is also
#'   prepended as a `leg` column to every returned row.
#' @param verbose If `TRUE`, print the id, legislature and each page index
#'   with `cat()`. This argument was previously accepted but ignored.
#' @param max_tries Number of consecutive failed requests tolerated for one
#'   page before pagination is abandoned and the rows gathered so far are
#'   returned; this prevents an endless retry loop on a failing URL.
#' @return A data frame holding the rows of every `documentList` received, or
#'   an empty `data.frame()` when none was received.
get_cre <- function(id, leg = 7, verbose = TRUE, max_tries = 3) {
  if (verbose) {
    cat("\n", id, "legislature", leg)
  }

  pages <- list()
  idx <- 0
  failures <- 0

  repeat {
    if (verbose) {
      cat(" ", idx, "...")
    }

    x <- paste0("http://www.europarl.europa.eu/meps/en/", id,
                "/see_more.html?type=CRE&leg=", leg, "&index=", idx)
    x <- try(fromJSON(readLines(x, warn = FALSE), flatten = TRUE))

    if (inherits(x, "try-error")) {
      warning("Scraper error: MEP ", id)
      failures <- failures + 1
      if (failures >= max_tries) {
        break
      }
      # Same index is requested again on the next iteration.
      next
    }
    failures <- 0

    # Only non-empty data frames are kept; cbind() cannot recycle `leg`
    # against zero rows.
    docs <- x$documentList
    if (is.data.frame(docs) && nrow(docs) > 0) {
      pages[[length(pages) + 1L]] <- cbind(leg, docs)
    }

    # Stop when the reply carries no usable next index.
    idx <- x$nextIndex
    if (length(idx) != 1L || is.na(idx) || !idx || idx < 0) {
      break
    }
  }

  if (length(pages) == 0L) {
    return(data.frame())
  }
  do.call(rbind, pages)
}
# Path of the cached MEP index; replaced below by the parsed data frame.
data <- "meps.csv"

# Scrape the MEP directory page only when no cached index exists on disk.
if (!file.exists(data)) {
  html <- htmlParse(
    "http://www.europarl.europa.eu/meps/en/directory.html?filter=all&leg=",
    encoding = "UTF-8"
  )

  # Prefix an XPath fragment with the path to each MEP's <li> items.
  root <- function(x) {
    paste0("//div[@class='zone_info_mep']/div[@class='mep_details']/ul/li", x)
  }

  link <- xpathSApply(html, root("[@class='mep_name']/a/@href"))
  name <- sapply(xpathSApply(html, root("[@class='mep_name']")), xmlValue)

  # Cache one row per MEP: profile link and displayed name.
  write.csv(data.frame(link, name), data)
}

data <- read.csv(data, stringsAsFactors = FALSE)
# Join the elements of `x` with ";" into one string.
# An empty `x` (length zero, including NULL) gives NA.
sanitize <- function(x) {
  is_empty <- length(x) < 1
  if (is_empty) NA else paste0(x, collapse = ";")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment