## briatte/01-scrape.r

## 01-scrape.r
library(dplyr)
library(httr)
library(rvest)
library(stringr)

# Output folders, created on first run: "raw" receives the downloaded result
# lists, "docs" the downloaded documents and "data" the yearly CSV datasets.
dir.create("data" , showWarnings = FALSE)
dir.create("raw"  , showWarnings = FALSE)
dir.create("docs" , showWarnings = FALSE)

# Process years from 2015 back to 1990. A year whose CSV already exists is
# skipped, so the script can be re-run without redoing finished years.
for (i in 2015:1990) {

  docs = str_c("data/docs-", i, ".csv")
  if (!file.exists(docs)) {

    # (1) get lists of nominative measures
    dir.create(str_c("raw/", i), showWarnings = FALSE)

    # initialized by absence of first page (skips existing records)
    f = str_c("raw/", i, "/001.html")
    if (!file.exists(f)) {

      cat("Year", i)
      GET("http://legifrance.gouv.fr/rechExpMesuresNominatives.do",
          query = list(champNom = "", champPrenom = "", champFonction = "",
                       champMinistere = "", champDecoration = "",
                       checkboxPeriode = "on",
                       champDatePublication1J = "01",
                       champDatePublication1M = "01",
                       champDatePublication1A = i,
                       champDatePublication2J = "31",
                       champDatePublication2M = "12",
                       champDatePublication2A = i)) %>%
        content("text", encoding = "UTF-8") %>%
        writeLines(f)

      h = read_html(f, encoding = "UTF-8")

      # number of results: digits found in the h3 headings of the first page
      n = html_nodes(h, "h3") %>%
        html_text %>%
        str_extract("\\d+") %>%
        na.omit %>%
        as.integer

      if (!length(n)) {
        cat(": empty\n") # skips 1993
        next
      }

      # keep a single count even if several headings contain digits
      n = n[ 1 ]

      # pages still to get after the first one (the division assumes 20
      # results per page); seq_len() yields nothing when everything fits on
      # page 1, whereas 2:1 would count down and request pages 2 and 1
      p = seq_len(n %/% 20 + (n %% 20 > 0))[ -1 ]

      cat(":", str_pad(n, width = 5), "document(s)",
          str_pad(1 + length(p), 5), "pages to download\n")

      if (length(p)) {

        # pagination link carrying the search identifier
        r = html_nodes(h, "a") %>%
          html_attr("href") %>%
          str_extract("rechExpMesuresNominatives.do(.*)fastReqId=\\d+") %>%
          na.omit %>%
          unique # reason why every search is unique and must start on page 1

        if (!length(r))
          stop("no pagination link found in ", f, call. = FALSE)

        r = r[ 1 ] # GET() needs a single URL

        pb = txtProgressBar(max = length(p), style = 3)
        for (k in seq_along(p)) {

          f = str_c("raw/", i, "/", str_pad(p[ k ], width = 3, pad = "0"),
                    ".html")
          GET(str_c("http://legifrance.gouv.fr/", r, "&page=", p[ k ])) %>%
            content("text", encoding = "UTF-8") %>%
            writeLines(f)

          setTxtProgressBar(pb, k)

        }

        cat("\n")

      }

    }

    cat("Year", i)
    f = str_c("raw/", i) %>% list.files(full.names = TRUE)

    # parse the lists: one data frame per page, stacked once at the end
    # instead of growing the dataset with rbind() inside a loop
    d = lapply(f, function(j) {

      h = read_html(j, encoding = "UTF-8") %>%
        html_nodes("li.resultat1 a")

      l = html_attr(h, "href") # links
      h = html_text(h) %>%
        str_replace_all("(\\\\r|\\\\n|\\n|\\\\t|\\s)+", " ") %>%
        str_trim # text
      w = (h != "Article")

      # title, jorf and nor are filled in step (2) for sampled documents
      data.frame(year = rep(i, sum(w)), doc = h[ w ], url = l[ w ],
                 title = rep(NA_character_, sum(w)),
                 jorf = rep(NA_character_, sum(w)),
                 nor = rep(NA_character_, sum(w)),
                 stringsAsFactors = FALSE)

    }) %>%
      bind_rows

    # a year is empty when its pages list no document at all; counting the
    # downloaded files instead would misreport a year with a single page
    if (!nrow(d)) {
      cat(": empty\n") # skips 1993
      next
    }

    cat(":", nrow(d) %>% str_pad(width = 5), "document(s)")

    # data sample: ambassadors and consuls general, masculine or feminine
    # (the final "e" is optional so that "consul général" is matched too)
    w = grepl("ambassad(eur|rice)|consul(e)? général(e)?", tolower(d$doc))
    w = d$url[ w ]
    cat(length(w) %>% str_pad(width = 5), "documents sampled\n")

    # (2) get the actual documents
    if (length(w)) {

      pb = txtProgressBar(max = length(w), style = 3)

      for (k in seq_along(w)) {

        j = w[ k ]

        # documents are cached in docs/ under their JORFTEXT identifier
        f = str_extract(j, "JORFTEXT\\d+")
        f = str_c("docs/", f, ".html")
        if (!file.exists(f)) {
          download.file(str_c("http://legifrance.gouv.fr/", j),
                        f, mode = "wb", quiet = TRUE)
        }

        h = read_html(f)
        setTxtProgressBar(pb, k)

        # fill in dataset columns
        m = which(d$url == j)

        # first match only: NA when the node is missing, and no length
        # mismatch when there are several
        s = html_nodes(h, "h2") %>%
          html_text %>%
          str_replace_all("(\\\\r|\\\\n|\\n|\\\\t|\\s)+", " ") %>%
          str_trim
        d$title[ m ] = s[ 1 ]

        h = html_nodes(h, ".enteteTexte") %>% html_text
        h = h[ 1 ]

        d$jorf[ m ] = str_extract(h, "JORF(.*)\\d{4}")
        d$nor[ m ] = str_extract(h, "NOR: (.*)") %>%
          str_replace("NOR:\\s", "")

      }

      cat("\n")

    }

    write.csv(d, docs, row.names = FALSE)

  }

}

## 02-plot.r
library(ggplot2)
library(lubridate)

# Stack the yearly datasets and classify each document by the first word of
# its description; anything other than "Arrêté" or "Décret" becomes "Divers".
d = list.files("data", pattern = "docs-", full.names = TRUE) %>%
  lapply(read.csv, stringsAsFactors = FALSE) %>%
  bind_rows %>%
  filter(nchar(doc) > 0) %>% # remove a few empty rows
  mutate(type = str_extract(doc, "^(Arr(ê)?té|Citation|Décision|Décret|Exequatur|Liste|Résultats|Tableau)") %>%
           str_replace("Arrté", "Arrêté"),
         type = ifelse(type %in% c("Arrêté", "Décret"), type, "Divers"))

# elections examined
e = c(1995, 2002, 2007, 2012)

# time: quarters, as year.quarter numbers (e.g. 1995.2) parsed from the date
# that follows the JORF issue number
d$year_q = str_replace(d$jorf, "JORF n°\\d+ du ", "") %>%
  parse_date_time("%d %m %Y", locale = "fr_FR") %>%
  quarter(with_year = TRUE)

# time: election examined
# intervals: 1990-1996 -> 1995, 1997-2003 -> 2002, 2004-2008 -> 2007,
# 2009-2016 -> 2012
d$year_g = cut(d$year, c(1990, e[ -length(e) ] + 1, 2016), labels = e,
               include.lowest = TRUE) %>%
  as.character

# time: electoral/post-electoral quarters
# TRUE from the second quarter of the election year onwards; the threshold
# sits halfway between Q1 (.1) and Q2 (.2) so that the result does not hinge
# on exact equality between floating-point numbers
d$elec = d$year_q >= as.numeric(d$year_g) + 0.15

# identify sampled rows
d$sample = !is.na(d$title) & d$year %in% c(e, e - 1, e - 2, e + 2)

# find gender of nominee (case-insensitive, like the sampling done on
# lowercased text when scraping)
d$gender = ifelse(grepl("ambassadrice|consule", d$title, ignore.case = TRUE),
                  "Females", "Males")

# plot quantities of nominations
# factor() sorts its levels, so the labels are sorted too (which also drops
# NA): labels taken in order of appearance could end up on the wrong quarter
ggplot(filter(d, sample) %>%
         mutate(year_q = factor(year_q, labels = sort(unique(year_q)) %>%
                                  str_sub(-4)),
                year_g = str_c(year_g, " election"))) +
  geom_bar(aes(x = factor(year_q), alpha = elec)) +
  scale_alpha_manual("Period:", values = c("TRUE" = 1, "FALSE" = 0.5),
                     labels = c("pre-electoral", "post-electoral")) +
  facet_grid(gender ~ year_g, scales = "free", space = "free_x") +
  labs(x = "\nYear.Quarter", y = "Number of nomination decrees\n") +
  theme_bw() +
  theme(axis.text.x = element_text(size = rel(0.5)),
        legend.position = "bottom",
        panel.grid = element_blank())

ggsave("jorf.png", width = 12, height = 6)
	library(dplyr)
	library(httr)
	library(rvest)
	library(stringr)

	# Output folders, created on first run: "raw" receives the downloaded result
	# lists, "docs" the downloaded documents and "data" the yearly CSV datasets.
	dir.create("data" , showWarnings = FALSE)
	dir.create("raw" , showWarnings = FALSE)
	dir.create("docs" , showWarnings = FALSE)

	# Process years from 2015 back to 1990. A year whose CSV already exists is
	# skipped, so the script can be re-run without redoing finished years.
	for (i in 2015:1990) {

	  docs = str_c("data/docs-", i, ".csv")
	  if (!file.exists(docs)) {

	    # (1) get lists of nominative measures
	    dir.create(str_c("raw/", i), showWarnings = FALSE)

	    # initialized by absence of first page (skips existing records)
	    f = str_c("raw/", i, "/001.html")
	    if (!file.exists(f)) {

	      cat("Year", i)
	      GET("http://legifrance.gouv.fr/rechExpMesuresNominatives.do",
	          query = list(champNom = "", champPrenom = "", champFonction = "",
	                       champMinistere = "", champDecoration = "",
	                       checkboxPeriode = "on",
	                       champDatePublication1J = "01",
	                       champDatePublication1M = "01",
	                       champDatePublication1A = i,
	                       champDatePublication2J = "31",
	                       champDatePublication2M = "12",
	                       champDatePublication2A = i)) %>%
	        content("text", encoding = "UTF-8") %>%
	        writeLines(f)

	      h = read_html(f, encoding = "UTF-8")

	      # number of results: digits found in the h3 headings of the first page
	      n = html_nodes(h, "h3") %>%
	        html_text %>%
	        str_extract("\\d+") %>%
	        na.omit %>%
	        as.integer

	      if (!length(n)) {
	        cat(": empty\n") # skips 1993
	        next
	      }

	      # keep a single count even if several headings contain digits
	      n = n[ 1 ]

	      # pages still to get after the first one (the division assumes 20
	      # results per page); seq_len() yields nothing when everything fits on
	      # page 1, whereas 2:1 would count down and request pages 2 and 1
	      p = seq_len(n %/% 20 + (n %% 20 > 0))[ -1 ]

	      cat(":", str_pad(n, width = 5), "document(s)",
	          str_pad(1 + length(p), 5), "pages to download\n")

	      if (length(p)) {

	        # pagination link carrying the search identifier
	        r = html_nodes(h, "a") %>%
	          html_attr("href") %>%
	          str_extract("rechExpMesuresNominatives.do(.*)fastReqId=\\d+") %>%
	          na.omit %>%
	          unique # reason why every search is unique and must start on page 1

	        if (!length(r))
	          stop("no pagination link found in ", f, call. = FALSE)

	        r = r[ 1 ] # GET() needs a single URL

	        pb = txtProgressBar(max = length(p), style = 3)
	        for (k in seq_along(p)) {

	          f = str_c("raw/", i, "/", str_pad(p[ k ], width = 3, pad = "0"),
	                    ".html")
	          GET(str_c("http://legifrance.gouv.fr/", r, "&page=", p[ k ])) %>%
	            content("text", encoding = "UTF-8") %>%
	            writeLines(f)

	          setTxtProgressBar(pb, k)

	        }

	        cat("\n")

	      }

	    }

	    cat("Year", i)
	    f = str_c("raw/", i) %>% list.files(full.names = TRUE)

	    # parse the lists: one data frame per page, stacked once at the end
	    # instead of growing the dataset with rbind() inside a loop
	    d = lapply(f, function(j) {

	      h = read_html(j, encoding = "UTF-8") %>%
	        html_nodes("li.resultat1 a")

	      l = html_attr(h, "href") # links
	      h = html_text(h) %>%
	        str_replace_all("(\\\\r|\\\\n|\\n|\\\\t|\\s)+", " ") %>%
	        str_trim # text
	      w = (h != "Article")

	      # title, jorf and nor are filled in step (2) for sampled documents
	      data.frame(year = rep(i, sum(w)), doc = h[ w ], url = l[ w ],
	                 title = rep(NA_character_, sum(w)),
	                 jorf = rep(NA_character_, sum(w)),
	                 nor = rep(NA_character_, sum(w)),
	                 stringsAsFactors = FALSE)

	    }) %>%
	      bind_rows

	    # a year is empty when its pages list no document at all; counting the
	    # downloaded files instead would misreport a year with a single page
	    if (!nrow(d)) {
	      cat(": empty\n") # skips 1993
	      next
	    }

	    cat(":", nrow(d) %>% str_pad(width = 5), "document(s)")

	    # data sample: ambassadors and consuls general, masculine or feminine
	    # (the final "e" is optional so that "consul général" is matched too)
	    w = grepl("ambassad(eur|rice)|consul(e)? général(e)?", tolower(d$doc))
	    w = d$url[ w ]
	    cat(length(w) %>% str_pad(width = 5), "documents sampled\n")

	    # (2) get the actual documents
	    if (length(w)) {

	      pb = txtProgressBar(max = length(w), style = 3)

	      for (k in seq_along(w)) {

	        j = w[ k ]

	        # documents are cached in docs/ under their JORFTEXT identifier
	        f = str_extract(j, "JORFTEXT\\d+")
	        f = str_c("docs/", f, ".html")
	        if (!file.exists(f)) {
	          download.file(str_c("http://legifrance.gouv.fr/", j),
	                        f, mode = "wb", quiet = TRUE)
	        }

	        h = read_html(f)
	        setTxtProgressBar(pb, k)

	        # fill in dataset columns
	        m = which(d$url == j)

	        # first match only: NA when the node is missing, and no length
	        # mismatch when there are several
	        s = html_nodes(h, "h2") %>%
	          html_text %>%
	          str_replace_all("(\\\\r|\\\\n|\\n|\\\\t|\\s)+", " ") %>%
	          str_trim
	        d$title[ m ] = s[ 1 ]

	        h = html_nodes(h, ".enteteTexte") %>% html_text
	        h = h[ 1 ]

	        d$jorf[ m ] = str_extract(h, "JORF(.*)\\d{4}")
	        d$nor[ m ] = str_extract(h, "NOR: (.*)") %>%
	          str_replace("NOR:\\s", "")

	      }

	      cat("\n")

	    }

	    write.csv(d, docs, row.names = FALSE)

	  }

	}
	library(ggplot2)
	library(lubridate)

	# Stack the yearly datasets and classify each document by the first word of
	# its description; anything other than "Arrêté" or "Décret" becomes "Divers".
	d = list.files("data", pattern = "docs-", full.names = TRUE) %>%
	  lapply(read.csv, stringsAsFactors = FALSE) %>%
	  bind_rows %>%
	  filter(nchar(doc) > 0) %>% # remove a few empty rows
	  mutate(type = str_extract(doc, "^(Arr(ê)?té|Citation|Décision|Décret|Exequatur|Liste|Résultats|Tableau)") %>%
	           str_replace("Arrté", "Arrêté"),
	         type = ifelse(type %in% c("Arrêté", "Décret"), type, "Divers"))

	# elections examined
	e = c(1995, 2002, 2007, 2012)

	# time: quarters, as year.quarter numbers (e.g. 1995.2) parsed from the date
	# that follows the JORF issue number
	d$year_q = str_replace(d$jorf, "JORF n°\\d+ du ", "") %>%
	  parse_date_time("%d %m %Y", locale = "fr_FR") %>%
	  quarter(with_year = TRUE)

	# time: election examined
	# intervals: 1990-1996 -> 1995, 1997-2003 -> 2002, 2004-2008 -> 2007,
	# 2009-2016 -> 2012
	d$year_g = cut(d$year, c(1990, e[ -length(e) ] + 1, 2016), labels = e,
	               include.lowest = TRUE) %>%
	  as.character

	# time: electoral/post-electoral quarters
	# TRUE from the second quarter of the election year onwards; the threshold
	# sits halfway between Q1 (.1) and Q2 (.2) so that the result does not hinge
	# on exact equality between floating-point numbers
	d$elec = d$year_q >= as.numeric(d$year_g) + 0.15

	# identify sampled rows
	d$sample = !is.na(d$title) & d$year %in% c(e, e - 1, e - 2, e + 2)

	# find gender of nominee (case-insensitive, like the sampling done on
	# lowercased text when scraping)
	d$gender = ifelse(grepl("ambassadrice|consule", d$title, ignore.case = TRUE),
	                  "Females", "Males")

	# plot quantities of nominations
	# factor() sorts its levels, so the labels are sorted too (which also drops
	# NA): labels taken in order of appearance could end up on the wrong quarter
	ggplot(filter(d, sample) %>%
	         mutate(year_q = factor(year_q, labels = sort(unique(year_q)) %>%
	                                  str_sub(-4)),
	                year_g = str_c(year_g, " election"))) +
	  geom_bar(aes(x = factor(year_q), alpha = elec)) +
	  scale_alpha_manual("Period:", values = c("TRUE" = 1, "FALSE" = 0.5),
	                     labels = c("pre-electoral", "post-electoral")) +
	  facet_grid(gender ~ year_g, scales = "free", space = "free_x") +
	  labs(x = "\nYear.Quarter", y = "Number of nomination decrees\n") +
	  theme_bw() +
	  theme(axis.text.x = element_text(size = rel(0.5)),
	        legend.position = "bottom",
	        panel.grid = element_blank())

	ggsave("jorf.png", width = 12, height = 6)