Skip to content

Instantly share code, notes, and snippets.

@briatte briatte/01-scrape.r
Last active Feb 2, 2016

Embed
What would you like to do?
Scrape nominative decisions published in the French "Journal officiel".
library(dplyr)
library(httr)
library(rvest)
library(stringr)
# Make sure the three output folders exist: parsed data, raw result pages and
# downloaded documents. Warnings about already existing folders are muted.
invisible(lapply(c("data", "raw", "docs"), dir.create, showWarnings = FALSE))
# Scrape the nominative measures published in the "Journal officiel", one
# year at a time, most recent year first.
#
# For each year without a data/docs-<year>.csv file:
#   (1) the paginated search results are saved to raw/<year>/NNN.html and
#       parsed into one row per document (year, doc, url);
#   (2) the documents matched by the sampling pattern are saved to docs/ and
#       used to fill in the title, jorf and nor columns.
# Files that already exist on disk are never downloaded again.
for (i in 2015:1990) {

  docs <- str_c("data/docs-", i, ".csv")

  # years that have already been exported are skipped entirely
  if (file.exists(docs)) {
    next
  }

  # (1) get lists of nominative measures
  dir.create(str_c("raw/", i), showWarnings = FALSE)

  # initialized by absence of first page (skips existing records)
  f <- str_c("raw/", i, "/001.html")

  if (!file.exists(f)) {

    cat("Year", i)

    GET("http://legifrance.gouv.fr/rechExpMesuresNominatives.do",
        query = list(champNom = "", champPrenom = "", champFonction = "",
                     champMinistere = "", champDecoration = "",
                     checkboxPeriode = "on",
                     champDatePublication1J = "01",
                     champDatePublication1M = "01",
                     champDatePublication1A = i,
                     champDatePublication2J = "31",
                     champDatePublication2M = "12",
                     champDatePublication2A = i)) %>%
      content("text", encoding = "UTF-8") %>%
      writeLines(f)

    h <- read_html(f, encoding = "UTF-8")

    # number of results: first number found in the <h3> headings
    n <- html_nodes(h, "h3") %>%
      html_text %>%
      str_extract("\\d+") %>%
      na.omit %>%
      as.integer

    if (!length(n)) {
      cat(": empty\n") # skips 1993
      next
    }

    # only the first number is used, even if several headings hold one
    n <- n[1]

    # 20 results per page; page 1 is already on disk, so only pages 2..last
    # are left to get. seq_len() yields no page at all when everything fits
    # on the first page, whereas 2:1 would count down to c(2, 1).
    last <- n %/% 20 + (n %% 20 > 0)
    p <- seq_len(last)[-1]

    cat(":", str_pad(n, width = 5), "document(s)",
        str_pad(1 + length(p), 5), "pages to download\n")

    if (length(p) > 0) {

      # the link to the other pages carries a request id, which is the
      # reason why every search is unique and must start on page 1
      r <- html_nodes(h, "a") %>%
        html_attr("href") %>%
        str_extract("rechExpMesuresNominatives.do(.*)fastReqId=\\d+") %>%
        na.omit %>%
        unique

      if (!length(r)) {
        stop("no link to the next result pages found for year ", i)
      }

      pb <- txtProgressBar(max = length(p), style = 3)

      for (k in seq_along(p)) {

        f <- str_c("raw/", i, "/", str_pad(p[k], width = 3, pad = "0"),
                   ".html")

        GET(str_c("http://legifrance.gouv.fr/", r[1], "&page=", p[k])) %>%
          content("text", encoding = "UTF-8") %>%
          writeLines(f)

        setTxtProgressBar(pb, k)

      }

      cat("\n")

    }

  }

  cat("Year", i)

  # parse the lists: one row per result link, minus the "Article" links
  f <- str_c("raw/", i) %>% list.files(full.names = TRUE)
  pages <- vector("list", length(f))

  for (k in seq_along(f)) {

    a <- read_html(f[k], encoding = "UTF-8") %>%
      html_nodes("li.resultat1 a")

    l <- html_attr(a, "href") # links

    txt <- html_text(a) %>%
      str_replace_all("(\\\\r|\\\\n|\\n|\\\\t|\\s)+", " ") %>%
      str_trim # text

    keep <- (txt != "Article")

    # pages without any record contribute nothing (their slot stays NULL)
    if (any(keep)) {
      pages[[k]] <- data_frame(year = i, doc = txt[keep], url = l[keep],
                               title = NA_character_, jorf = NA_character_,
                               nor = NA_character_)
    }

  }

  # bind once, after the loop, instead of growing the data frame page by page
  d <- bind_rows(pages)

  # a year is empty when no record was parsed; the number of files is not a
  # reliable test, since a year that fits on a single page also has one file
  if (!nrow(d)) {
    cat(": empty\n") # skips 1993
    next
  }

  cat(":", nrow(d) %>% str_pad(width = 5), "document(s)")

  # data sample: ambassadors and consuls general, of either gender
  # (the final "e" is optional, so that "consul général" is matched too)
  s <- grepl("ambassad(eur|rice)|consul(e)? général(e)?", tolower(d$doc))
  w <- d$url[s]

  cat(length(w) %>% str_pad(width = 5), "documents sampled\n")

  # (2) get the actual documents
  if (length(w)) {

    pb <- txtProgressBar(max = length(w), style = 3)

    for (k in seq_along(w)) {

      j <- w[k]

      # documents are stored under their JORFTEXT identifier
      f <- str_extract(j, "JORFTEXT\\d+")
      f <- str_c("docs/", f, ".html")

      if (!file.exists(f)) {
        download.file(str_c("http://legifrance.gouv.fr/", j),
                      f, mode = "wb", quiet = TRUE)
      }

      h <- read_html(f)

      setTxtProgressBar(pb, k)

      # fill in dataset columns; which() drops rows with a missing url, and
      # [1] keeps every assignment at length one (NA when a node is missing)
      rows <- which(d$url == j)

      title <- html_nodes(h, "h2") %>%
        html_text %>%
        str_replace_all("(\\\\r|\\\\n|\\n|\\\\t|\\s)+", " ") %>%
        str_trim
      d$title[rows] <- title[1]

      header <- html_nodes(h, ".enteteTexte") %>% html_text
      header <- header[1]

      d$jorf[rows] <- str_extract(header, "JORF(.*)\\d{4}")
      d$nor[rows] <- str_extract(header, "NOR: (.*)") %>%
        str_replace("NOR:\\s", "")

    }

    cat("\n")

  }

  write.csv(d, docs, row.names = FALSE)

}
library(ggplot2)
library(lubridate)
# Stack the yearly CSV files into a single data frame.
d <- list.files("data", pattern = "docs-", full.names = TRUE) %>%
  lapply(read.csv, stringsAsFactors = FALSE) %>%
  bind_rows

# remove a few empty rows
d <- filter(d, nchar(doc) > 0)

# type of decision: first word of the document text, with the unaccented
# spelling of "Arrêté" normalized; anything but "Arrêté" or "Décret"
# (including no match at all) is coded as "Divers"
d <- mutate(
  d,
  type = str_replace(
    str_extract(
      doc,
      "^(Arr(ê)?té|Citation|Décision|Décret|Exequatur|Liste|Résultats|Tableau)"
    ),
    "Arrté", "Arrêté"
  ),
  type = ifelse(type %in% c("Arrêté", "Décret"), type, "Divers")
)

# elections examined
e <- c(1995, 2002, 2007, 2012)

# time: quarters, in year.quarter form, from the publication date that
# follows the issue number in the jorf column
d$year_q <- quarter(
  parse_date_time(str_replace(d$jorf, "JORF n°\\d+ du ", ""),
                  "%d %m %Y", locale = "fr_FR"),
  with_year = TRUE
)

# time: election examined; each group runs up to the year that follows the
# election, and the last one up to 2016
d$year_g <- as.character(
  cut(d$year, c(1990, e[ -length(e) ] + 1, 2016), labels = e,
      include.lowest = TRUE)
)

# time: electoral/post-electoral quarters; TRUE from the second quarter of
# the election year of the group onwards, NA when quarter or group is missing
d$elec <- d$year_q >= as.numeric(d$year_g) + 0.2

# identify sampled rows: documents with a title, published in the election
# year, in the two years before it, or two years after it
# NOTE(review): with the breaks used for year_g, a year at e + 2 belongs to
# the group of the next election (e.g. 1997 is grouped with 2002) -- confirm
# that e + 1 was not intended here
d$sample <- !is.na(d$title) & d$year %in% c(e, e - 1, e - 2, e + 2)

# find gender of nominee: feminine forms in the title mean "Females",
# everything else (including a missing title) means "Males"
d$gender <- c("Males", "Females")[ 1 + grepl("ambassadrice|consule", d$title) ]
# plot quantities of nominations: one bar per quarter, faceted by gender
# (rows) and by election (columns), with the quarters before the election
# drawn half-transparent
ggplot(filter(d, sample) %>%
         mutate(
           # factor() sorts its levels, so the labels must come from the
           # sorted quarters as well: labels taken in order of appearance
           # end up on the wrong bars when the rows are not chronological.
           # sort() also drops NA, which never becomes a level.
           # The labels are the last four characters, e.g. "95.2".
           year_q = factor(year_q,
                           levels = sort(unique(year_q)),
                           labels = sort(unique(year_q)) %>%
                             str_sub(-4)),
           year_g = str_c(year_g, " election"))) +
  geom_bar(aes(x = factor(year_q), alpha = elec)) +
  # legend labels follow the default break order of the scale: FALSE, TRUE
  scale_alpha_manual("Period:", values = c("TRUE" = 1, "FALSE" = 0.5),
                     labels = c("pre-electoral", "post-electoral")) +
  facet_grid(gender ~ year_g, scales = "free", space = "free_x") +
  labs(x = "\nYear.Quarter", y = "Number of nomination decrees\n") +
  theme_bw() +
  theme(axis.text.x = element_text(size = rel(0.5)),
        legend.position = "bottom",
        panel.grid = element_blank())

# saves the plot created just above (ggsave defaults to the last plot)
ggsave("jorf.png", width = 12, height = 6)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.