# briatte/full.r

## full.r
library(XML)
library(jsonlite)
library(plyr)

# Folder receiving one CSV file per MEP. showWarnings = FALSE keeps re-runs
# quiet: without it dir.create() warns every time the folder already exists.
dir.create("records", showWarnings = FALSE)

# `data` holds the path of the cached MEP index until the read.csv() call at
# the end of this section replaces it with the index itself.
data = "meps.csv"
if(!file.exists(data)) {

  # no cached index on disk: download and parse the directory page
  html = "http://www.europarl.europa.eu/meps/en/directory.html?filter=all&leg="
  html = htmlParse(html, encoding = "UTF-8")

  # index page
  # root() appends an XPath fragment to the path of the MEP detail list items
  root = function(x) paste0("//div[@class='zone_info_mep']/div[@class='mep_details']/ul/li", x)
  link = xpathSApply(html, root("[@class='mep_name']/a/@href"))

  # text content of the same list items that carry the links
  name = xpathSApply(html, root("[@class='mep_name']"))
  name = sapply(name, xmlValue)

#   # MEPs currently in power (different page, ongoing legislature only)
#   natl = xpathSApply(html, root("[contains(@class, 'nationality')]/@class"))
#   natl = gsub("nationality ", "", natl)
#   group = xpathSApply(html, root("[contains(@class, 'group')]/@class"))
#   group = gsub("group ", "", group)
#   party = xpathSApply(html, root("[contains(@class, 'nationality')]/span"))
#   party = sapply(party, xmlValue)
#   party = gsub("\\\"", "", party)

#   # add memberships from individual MEP pages (ongoing legislature only)
#   member = sapply(link, function(x) {
#     print(x)
#     html = htmlParse(paste0("http://www.europarl.europa.eu/", x))
#     root = "//ul[@class='events_collection']"
#     html = sapply(xpathSApply(html, paste0(root, "/*/acronym | ", root, "/*/*/acronym")), xmlValue)
#     return(paste0(html, collapse = ";"))
#   })

  # cache the index so that later runs skip the download
  write.csv(data.frame(link, name), data) # , natl, party, group, member

}

# from here on `data` is the index data frame (columns written above)
data = read.csv(data, stringsAsFactors = FALSE)

# Download the "CRE" records of one MEP for one legislature.
#
# Requests the see_more.html JSON pages one after the other, starting at
# index 0 and continuing with the `nextIndex` value of each response. Paging
# stops when `nextIndex` is 0, negative, NA or absent.
#
# Arguments:
#   id         MEP identifier inserted in the request URL.
#   leg        legislature number sent as the `leg` query parameter; it is
#              also bound as the first column (`leg`) of the result.
#   verbose    if TRUE, print the id, the legislature and each page index
#              with cat().
#   max_tries  consecutive failed attempts tolerated on one page. Once
#              reached, paging stops and the rows collected so far are
#              returned; this replaces the endless retry of a page that
#              keeps failing.
#
# Returns a data frame stacking the `documentList` of every page received,
# or an empty data frame when no page delivered any rows. A warning is
# raised for every failed attempt.
get_cre <- function(id, leg = 7, verbose = TRUE, max_tries = 3) {
  if (verbose)
    cat("\n", id, "legislature", leg)

  pages <- list()   # one data frame per successfully parsed page
  idx <- 0
  failures <- 0     # consecutive failures on the current page

  while (idx > -1) {
    if (verbose)
      cat(" ", idx, "...")

    page_url <- paste0("http://www.europarl.europa.eu/meps/en/", id,
                       "/see_more.html?type=CRE&leg=", leg, "&index=", idx)
    x <- try(fromJSON(readLines(page_url, warn = FALSE), flatten = TRUE))

    if (inherits(x, "try-error")) {
      warning("Scraper error: MEP ", id)
      failures <- failures + 1
      # idx is unchanged, so the same page is requested again unless the
      # retry budget is exhausted
      if (failures >= max_tries)
        break
      next
    }
    failures <- 0

    docs <- x$documentList
    # pages without documents do not come back as a data frame; skip them
    if (inherits(docs, "data.frame") && nrow(docs) > 0)
      pages[[length(pages) + 1]] <- cbind(leg, docs)

    idx <- x$nextIndex
    # a missing, NA or zero nextIndex means there is nothing left to fetch
    if (length(idx) != 1 || is.na(idx) || !idx)
      idx <- -1
  }

  if (length(pages) == 0)
    return(data.frame())

  # bind once at the end instead of growing a data frame inside the loop
  do.call(rbind, pages)
}

# Collapse a vector into a single ";"-separated string.
# An input of length zero (including NULL) yields NA instead.
sanitize <- function(x) {
  if (length(x) < 1) {
    return(NA)
  }
  paste0(x, collapse = ";")
}

# MEP identifiers: the run of digits that follows "/meps/en/" in each link
files = gsub("/meps/en/(\\d+)(.*)", "\\1", data$link)

# list-columns of the downloaded records that are collapsed before export
list_cols = c("formatList", "committeeList", "voteExplanationList")

# Iterate by position so that the progress message stays a single number
# even when an identifier occurs more than once in `files`.
for(k in seq_along(files)) {

  i = files[k]
  file = paste0("records/", i, "_cre.csv")

  # records already on disk are skipped, which makes the script resumable
  if(!file.exists(file)) {

    # one download per legislature (1 to 7), stacked with missing columns filled
    record = lapply(1:7, function(y) get_cre(i, y))
    record = rbind.fill(record)

    if(length(record) > 0) {

      record = data.frame(id = i, record)

      # only touch the list-columns that are present: assigning the result of
      # sapply() over a missing column would abort the whole run
      for(j in intersect(list_cols, names(record)))
        record[[j]] = sapply(record[[j]], sanitize)

      record = lapply(record, unlist)

      write.csv(record, file)

    }

    message(paste("Scraped: MEP", i, length(files) - k, "left"))

  }

}

# kthxbye

## get_all.r
# MEP identifiers: the run of digits that follows "/meps/en/" in each link
files = gsub("/meps/en/(\\d+)(.*)", "\\1", data$link)

# list-columns of the downloaded records that are collapsed before export
list_cols = c("formatList", "committeeList", "voteExplanationList")

# Walk the identifiers by position: the position gives the number of MEPs
# left directly, and stays a single value when identifiers are duplicated.
for(k in seq_along(files)) {

  i = files[k]
  file = paste0("records/", i, "_cre.csv")

  # a record file that already exists is not downloaded again
  if(!file.exists(file)) {

    # legislatures 1 to 7, stacked into one data frame
    record = lapply(1:7, function(y) get_cre(i, y))
    record = rbind.fill(record)

    if(length(record) > 0) {

      record = data.frame(id = i, record)

      # collapse each list-column that exists; a column absent from the
      # records is skipped instead of raising an assignment error
      for(j in intersect(list_cols, names(record)))
        record[[j]] = sapply(record[[j]], sanitize)

      record = lapply(record, unlist)

      write.csv(record, file)

    }

    message(paste("Scraped: MEP", i, length(files) - k, "left"))

  }

}

## get_cre.r
# Download the "CRE" records of one MEP for one legislature.
#
# Requests the see_more.html JSON pages in sequence, starting at index 0 and
# continuing with the `nextIndex` value of each response. Paging stops when
# `nextIndex` is 0, negative, NA or absent.
#
# Arguments:
#   id         MEP identifier inserted in the request URL.
#   leg        legislature number sent as the `leg` query parameter; it is
#              also bound as the first column (`leg`) of the result.
#   verbose    accepted for compatibility with callers; this version of the
#              function does not print anything.
#   max_tries  consecutive failed attempts tolerated on one page. Once
#              reached, paging stops and the rows collected so far are
#              returned, so a page that keeps failing cannot loop forever.
#
# Returns a data frame stacking the `documentList` of every page received,
# or an empty data frame when no page delivered any rows. A warning is
# raised for every failed attempt.
get_cre <- function(id, leg = 7, verbose = TRUE, max_tries = 3) {

  pages <- list()   # one data frame per successfully parsed page
  idx <- 0
  failures <- 0     # consecutive failures on the current page

  while (idx > -1) {

    page_url <- paste0("http://www.europarl.europa.eu/meps/en/", id,
                       "/see_more.html?type=CRE&leg=", leg, "&index=", idx)
    x <- try(fromJSON(readLines(page_url, warn = FALSE), flatten = TRUE))

    if (inherits(x, "try-error")) {

      warning("Scraper error: MEP ", id)
      failures <- failures + 1

      # idx is unchanged, so the same page is tried again until the retry
      # budget runs out
      if (failures >= max_tries)
        break

    } else {

      failures <- 0
      docs <- x$documentList

      # pages without documents do not come back as a data frame; skip them
      if (inherits(docs, "data.frame") && nrow(docs) > 0)
        pages[[length(pages) + 1]] <- cbind(leg, docs)

      idx <- x$nextIndex

      # a missing, NA or zero nextIndex means there is nothing left to fetch
      if (length(idx) != 1 || is.na(idx) || !idx)
        idx <- -1
    }

  }

  if (length(pages) == 0)
    return(data.frame())

  # bind once at the end instead of growing a data frame inside the loop
  do.call(rbind, pages)

}

## get_mps.r
# Path of the cached MEP index. The same variable is overwritten with the
# parsed index (a data frame) by the last statement of this section.
data <- "meps.csv"

# Build the cache only when it is not on disk yet.
if (!file.exists(data)) {

  # download and parse the MEP directory page
  html <- htmlParse(
    "http://www.europarl.europa.eu/meps/en/directory.html?filter=all&leg=",
    encoding = "UTF-8"
  )

  # root() appends an XPath fragment to the path of the MEP detail list items
  root <- function(x) {
    paste0("//div[@class='zone_info_mep']/div[@class='mep_details']/ul/li", x)
  }

  # href attribute of the link found inside each 'mep_name' item
  link <- xpathSApply(html, root("[@class='mep_name']/a/@href"))

  # text content of each 'mep_name' item
  name <- sapply(xpathSApply(html, root("[@class='mep_name']")), xmlValue)

  write.csv(data.frame(link, name), data)

}

data <- read.csv(data, stringsAsFactors = FALSE)

## sanitize.r
# Join the elements of `x` with ";" into one string.
# When `x` has no elements (NULL included) the result is NA.
sanitize <- function(x) {
  joined <- NA
  if (length(x) >= 1) {
    joined <- paste0(x, collapse = ";")
  }
  joined
}
	library(XML)
	library(jsonlite)
	library(plyr)

	# Folder receiving one CSV file per MEP. showWarnings = FALSE keeps re-runs
	# quiet: without it dir.create() warns every time the folder already exists.
	dir.create("records", showWarnings = FALSE)

	# `data` holds the path of the cached MEP index until the read.csv() call at
	# the end of this section replaces it with the index itself.
	data = "meps.csv"
	if(!file.exists(data)) {

	  # no cached index on disk: download and parse the directory page
	  html = "http://www.europarl.europa.eu/meps/en/directory.html?filter=all&leg="
	  html = htmlParse(html, encoding = "UTF-8")

	  # index page
	  # root() appends an XPath fragment to the path of the MEP detail list items
	  root = function(x) paste0("//div[@class='zone_info_mep']/div[@class='mep_details']/ul/li", x)
	  link = xpathSApply(html, root("[@class='mep_name']/a/@href"))

	  # text content of the same list items that carry the links
	  name = xpathSApply(html, root("[@class='mep_name']"))
	  name = sapply(name, xmlValue)

	  # # MEPs currently in power (different page, ongoing legislature only)
	  # natl = xpathSApply(html, root("[contains(@class, 'nationality')]/@class"))
	  # natl = gsub("nationality ", "", natl)
	  # group = xpathSApply(html, root("[contains(@class, 'group')]/@class"))
	  # group = gsub("group ", "", group)
	  # party = xpathSApply(html, root("[contains(@class, 'nationality')]/span"))
	  # party = sapply(party, xmlValue)
	  # party = gsub("\\\"", "", party)

	  # # add memberships from individual MEP pages (ongoing legislature only)
	  # member = sapply(link, function(x) {
	  # print(x)
	  # html = htmlParse(paste0("http://www.europarl.europa.eu/", x))
	  # root = "//ul[@class='events_collection']"
	  # html = sapply(xpathSApply(html, paste0(root, "//acronym | ", root, "//*/acronym")), xmlValue)
	  # return(paste0(html, collapse = ";"))
	  # })

	  # cache the index so that later runs skip the download
	  write.csv(data.frame(link, name), data) # , natl, party, group, member

	}

	# from here on `data` is the index data frame (columns written above)
	data = read.csv(data, stringsAsFactors = FALSE)

	# Download the "CRE" records of one MEP for one legislature.
	#
	# Requests the see_more.html JSON pages one after the other, starting at
	# index 0 and continuing with the `nextIndex` value of each response. Paging
	# stops when `nextIndex` is 0, negative, NA or absent.
	#
	# Arguments:
	#   id         MEP identifier inserted in the request URL.
	#   leg        legislature number sent as the `leg` query parameter; it is
	#              also bound as the first column (`leg`) of the result.
	#   verbose    if TRUE, print the id, the legislature and each page index
	#              with cat().
	#   max_tries  consecutive failed attempts tolerated on one page. Once
	#              reached, paging stops and the rows collected so far are
	#              returned; this replaces the endless retry of a page that
	#              keeps failing.
	#
	# Returns a data frame stacking the `documentList` of every page received,
	# or an empty data frame when no page delivered any rows. A warning is
	# raised for every failed attempt.
	get_cre <- function(id, leg = 7, verbose = TRUE, max_tries = 3) {
	  if (verbose)
	    cat("\n", id, "legislature", leg)

	  pages <- list()   # one data frame per successfully parsed page
	  idx <- 0
	  failures <- 0     # consecutive failures on the current page

	  while (idx > -1) {
	    if (verbose)
	      cat(" ", idx, "...")

	    page_url <- paste0("http://www.europarl.europa.eu/meps/en/", id,
	                       "/see_more.html?type=CRE&leg=", leg, "&index=", idx)
	    x <- try(fromJSON(readLines(page_url, warn = FALSE), flatten = TRUE))

	    if (inherits(x, "try-error")) {
	      warning("Scraper error: MEP ", id)
	      failures <- failures + 1
	      # idx is unchanged, so the same page is requested again unless the
	      # retry budget is exhausted
	      if (failures >= max_tries)
	        break
	      next
	    }
	    failures <- 0

	    docs <- x$documentList
	    # pages without documents do not come back as a data frame; skip them
	    if (inherits(docs, "data.frame") && nrow(docs) > 0)
	      pages[[length(pages) + 1]] <- cbind(leg, docs)

	    idx <- x$nextIndex
	    # a missing, NA or zero nextIndex means there is nothing left to fetch
	    if (length(idx) != 1 || is.na(idx) || !idx)
	      idx <- -1
	  }

	  if (length(pages) == 0)
	    return(data.frame())

	  # bind once at the end instead of growing a data frame inside the loop
	  do.call(rbind, pages)
	}

	# Flatten `x` into one string whose parts are separated by ";".
	# An empty `x` (NULL included) gives NA.
	sanitize <- function(x) {
	  has_values <- length(x) >= 1
	  if (!has_values) NA else paste0(x, collapse = ";")
	}

	# MEP identifiers: the run of digits that follows "/meps/en/" in each link
	files = gsub("/meps/en/(\\d+)(.*)", "\\1", data$link)

	# list-columns of the downloaded records that are collapsed before export
	list_cols = c("formatList", "committeeList", "voteExplanationList")

	# Iterate by position so that the progress message stays a single number
	# even when an identifier occurs more than once in `files`.
	for(k in seq_along(files)) {

	  i = files[k]
	  file = paste0("records/", i, "_cre.csv")

	  # records already on disk are skipped, which makes the script resumable
	  if(!file.exists(file)) {

	    # one download per legislature (1 to 7), stacked with missing columns filled
	    record = lapply(1:7, function(y) get_cre(i, y))
	    record = rbind.fill(record)

	    if(length(record) > 0) {

	      record = data.frame(id = i, record)

	      # only touch the list-columns that are present: assigning the result
	      # of sapply() over a missing column would abort the whole run
	      for(j in intersect(list_cols, names(record)))
	        record[[j]] = sapply(record[[j]], sanitize)

	      record = lapply(record, unlist)

	      write.csv(record, file)

	    }

	    message(paste("Scraped: MEP", i, length(files) - k, "left"))

	  }

	}

	# kthxbye