Last active
October 7, 2016 16:11
-
-
Save briatte/df0f41f5ce97443e0a0a to your computer and use it in GitHub Desktop.
100-line scraper for plenary statements by Members of the European Parliament — see briatte/euspeech for the full project
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(XML) | |
library(jsonlite) | |
library(plyr) | |
# Output directory for the per-MEP CSV files. dir.create() warns when the
# directory already exists, so silence that to keep re-runs quiet.
dir.create("records", showWarnings = FALSE)

# MEP index, cached on disk. `data` first holds the cache file name and is
# then replaced by the data frame read from that file.
data <- "meps.csv"
if (!file.exists(data)) {
  html <- "http://www.europarl.europa.eu/meps/en/directory.html?filter=all&leg="
  html <- htmlParse(html, encoding = "UTF-8")
  # index page
  # root() appends an XPath suffix to the <li> items of each MEP details box
  root <- function(x) {
    paste0("//div[@class='zone_info_mep']/div[@class='mep_details']/ul/li", x)
  }
  link <- xpathSApply(html, root("[@class='mep_name']/a/@href"))
  name <- xpathSApply(html, root("[@class='mep_name']"))
  name <- sapply(name, xmlValue)
  # # MEPs currently in power (different page, ongoing legislature only)
  # natl = xpathSApply(html, root("[contains(@class, 'nationality')]/@class"))
  # natl = gsub("nationality ", "", natl)
  # group = xpathSApply(html, root("[contains(@class, 'group')]/@class"))
  # group = gsub("group ", "", group)
  # party = xpathSApply(html, root("[contains(@class, 'nationality')]/span"))
  # party = sapply(party, xmlValue)
  # party = gsub("\\\"", "", party)
  # # add memberships from individual MEP pages (ongoing legislature only)
  # member = sapply(link, function(x) {
  #   print(x)
  #   html = htmlParse(paste0("http://www.europarl.europa.eu/", x))
  #   root = "//ul[@class='events_collection']"
  #   html = sapply(xpathSApply(html, paste0(root, "/*/acronym | ", root, "/*/*/acronym")), xmlValue)
  #   return(paste0(html, collapse = ";"))
  # })
  write.csv(data.frame(link, name), data) # , natl, party, group, member
}
data <- read.csv(data, stringsAsFactors = FALSE)
#' Download the plenary-statement (CRE) listing of one MEP for one legislature.
#'
#' Pages through the `see_more.html?type=CRE` JSON endpoint, following the
#' `nextIndex` value of each response until it is missing or not positive.
#'
#' @param id MEP identifier, as used in the europarl.europa.eu URLs.
#' @param leg Legislature number; also stored in the `leg` column of the result.
#' @param verbose Print progress (MEP id, legislature, page index) with `cat()`?
#' @param max_tries Number of consecutive failed requests tolerated for one
#'   page before giving up. Without this bound a permanently failing URL would
#'   be retried forever.
#' @return A data frame with a leading `leg` column followed by the columns of
#'   the downloaded `documentList`, or an empty `data.frame()` when nothing
#'   was retrieved.
get_cre <- function(id, leg = 7, verbose = TRUE, max_tries = 5) {
  if (verbose) {
    cat("\n", id, "legislature", leg)
  }
  pages <- list()
  idx <- 0
  failures <- 0
  repeat {
    if (verbose) {
      cat(" ", idx, "...")
    }
    url <- paste0("http://www.europarl.europa.eu/meps/en/", id,
                  "/see_more.html?type=CRE&leg=", leg, "&index=", idx)
    x <- tryCatch(
      fromJSON(readLines(url, warn = FALSE), flatten = TRUE),
      error = function(e) e
    )
    if (inherits(x, "error")) {
      warning("Scraper error: MEP ", id, ": ", conditionMessage(x))
      failures <- failures + 1
      if (failures >= max_tries) {
        warning("Giving up on MEP ", id, " legislature ", leg, " at index ",
                idx, " after ", failures, " failed attempts", call. = FALSE)
        break
      }
      next # retry the same page
    }
    failures <- 0
    docs <- x$documentList
    # skip empty pages: cbind() of a scalar with a zero-row data frame errors
    if (is.data.frame(docs) && nrow(docs) > 0) {
      pages[[length(pages) + 1]] <- cbind(leg = leg, docs)
    }
    nxt <- x$nextIndex
    # a missing, NA or non-positive next index marks the last page
    if (is.null(nxt) || length(nxt) != 1 || is.na(nxt) || nxt <= 0) {
      break
    }
    idx <- nxt
  }
  if (length(pages) == 0) {
    return(data.frame())
  }
  # bind once at the end instead of growing a data frame inside the loop
  do.call(rbind, pages)
}
# Collapse a vector into a single ";"-separated string; an empty (or NULL)
# input yields NA instead.
sanitize <- function(x) {
  if (length(x) < 1) {
    return(NA)
  }
  paste0(x, collapse = ";")
}
# Scrape every MEP listed in `data` and cache one CSV per MEP under records/.
# MEPs whose file already exists are skipped, so an interrupted run can be
# resumed by running the script again.
files <- gsub("/meps/en/(\\d+)(.*)", "\\1", data$link)

# Columns collapsed to one ";"-separated string per row before writing.
list_cols <- c("formatList", "committeeList", "voteExplanationList")

for (k in seq_along(files)) {
  i <- files[k]
  file <- paste0("records/", i, "_cre.csv")
  if (!file.exists(file)) {
    # one request series per legislature, 1 to 7
    record <- lapply(1:7, function(y) get_cre(i, y))
    record <- rbind.fill(record)
    if (length(record) > 0) {
      record <- data.frame(id = i, record)
      # Only collapse columns that are actually present: assigning the
      # zero-length result obtained from a missing column would abort the run.
      for (col in intersect(list_cols, names(record))) {
        record[[col]] <- vapply(
          record[[col]],
          function(v) as.character(sanitize(v)),
          character(1),
          USE.NAMES = FALSE
        )
      }
      record <- lapply(record, unlist)
      write.csv(record, file)
    }
    # use the loop position so duplicated ids still give a single count
    message(paste("Scraped: MEP", i, length(files) - k, "left"))
  }
}
# kthxbye |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape every MEP listed in `data` and cache one CSV per MEP under records/.
# MEPs whose file already exists are skipped, so an interrupted run can be
# resumed by running the script again.
files <- gsub("/meps/en/(\\d+)(.*)", "\\1", data$link)

# Columns collapsed to one ";"-separated string per row before writing.
list_cols <- c("formatList", "committeeList", "voteExplanationList")

for (k in seq_along(files)) {
  i <- files[k]
  file <- paste0("records/", i, "_cre.csv")
  if (!file.exists(file)) {
    # one request series per legislature, 1 to 7
    record <- lapply(1:7, function(y) get_cre(i, y))
    record <- rbind.fill(record)
    if (length(record) > 0) {
      record <- data.frame(id = i, record)
      # Only collapse columns that are actually present: assigning the
      # zero-length result obtained from a missing column would abort the run.
      for (col in intersect(list_cols, names(record))) {
        record[[col]] <- vapply(
          record[[col]],
          function(v) as.character(sanitize(v)),
          character(1),
          USE.NAMES = FALSE
        )
      }
      record <- lapply(record, unlist)
      write.csv(record, file)
    }
    # use the loop position so duplicated ids still give a single count
    message(paste("Scraped: MEP", i, length(files) - k, "left"))
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Download the plenary-statement (CRE) listing of one MEP for one legislature.
#'
#' Pages through the `see_more.html?type=CRE` JSON endpoint, following the
#' `nextIndex` value of each response until it is missing or not positive.
#'
#' @param id MEP identifier, as used in the europarl.europa.eu URLs.
#' @param leg Legislature number; also stored in the `leg` column of the result.
#' @param verbose Print progress (MEP id, legislature, page index) with `cat()`?
#'   The argument was previously accepted but ignored.
#' @param max_tries Number of consecutive failed requests tolerated for one
#'   page before giving up. Without this bound a permanently failing URL would
#'   be retried forever.
#' @return A data frame with a leading `leg` column followed by the columns of
#'   the downloaded `documentList`, or an empty `data.frame()` when nothing
#'   was retrieved.
get_cre <- function(id, leg = 7, verbose = TRUE, max_tries = 5) {
  if (verbose) {
    cat("\n", id, "legislature", leg)
  }
  pages <- list()
  idx <- 0
  failures <- 0
  repeat {
    if (verbose) {
      cat(" ", idx, "...")
    }
    url <- paste0("http://www.europarl.europa.eu/meps/en/", id,
                  "/see_more.html?type=CRE&leg=", leg, "&index=", idx)
    x <- tryCatch(
      fromJSON(readLines(url, warn = FALSE), flatten = TRUE),
      error = function(e) e
    )
    if (inherits(x, "error")) {
      warning("Scraper error: MEP ", id, ": ", conditionMessage(x))
      failures <- failures + 1
      if (failures >= max_tries) {
        warning("Giving up on MEP ", id, " legislature ", leg, " at index ",
                idx, " after ", failures, " failed attempts", call. = FALSE)
        break
      }
      next # retry the same page
    }
    failures <- 0
    docs <- x$documentList
    # skip empty pages: cbind() of a scalar with a zero-row data frame errors
    if (is.data.frame(docs) && nrow(docs) > 0) {
      pages[[length(pages) + 1]] <- cbind(leg = leg, docs)
    }
    nxt <- x$nextIndex
    # a missing, NA or non-positive next index marks the last page
    if (is.null(nxt) || length(nxt) != 1 || is.na(nxt) || nxt <= 0) {
      break
    }
    idx <- nxt
  }
  if (length(pages) == 0) {
    return(data.frame())
  }
  # bind once at the end instead of growing a data frame inside the loop
  do.call(rbind, pages)
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the MEP index, scraping it from the directory page when the cached
# CSV is not on disk yet. `data` holds the file name, then the data frame.
data <- "meps.csv"
if (!file.exists(data)) {
  html <- htmlParse(
    "http://www.europarl.europa.eu/meps/en/directory.html?filter=all&leg=",
    encoding = "UTF-8"
  )
  # XPath prefix shared by both queries; `x` is the suffix to append
  root <- function(x) {
    paste0("//div[@class='zone_info_mep']/div[@class='mep_details']/ul/li", x)
  }
  link <- xpathSApply(html, root("[@class='mep_name']/a/@href"))
  name <- sapply(xpathSApply(html, root("[@class='mep_name']")), xmlValue)
  write.csv(data.frame(link, name), data)
}
data <- read.csv(data, stringsAsFactors = FALSE)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Join the elements of `x` with ";" into one string, or give NA when `x` has
# no elements.
sanitize <- function(x) {
  if (length(x) >= 1) {
    paste0(x, collapse = ";")
  } else {
    NA
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment