# Scrape nominative decisions published in the French "Journal officiel"
# Packages used by the scraper: dplyr (tibbles, binding rows), httr (HTTP
# requests), rvest (HTML parsing) and stringr (string helpers).
library(dplyr)
library(httr)
library(rvest)
library(stringr)

# Working folders, created on first run and silently reused afterwards:
#   data/ : one CSV index per year
#   raw/  : cached search-result pages, one sub-folder per year
#   docs/ : cached pages of the sampled decisions
for (folder in c("data", "raw", "docs")) {
  dir.create(folder, showWarnings = FALSE)
}
# Build one CSV index of nominative measures per year (data/docs-YEAR.csv).
#
# For every year that has no CSV yet:
#   (1) the paginated search results are cached as raw/YEAR/NNN.html and
#       parsed into one row per document (year, doc, url);
#   (2) documents whose label mentions an ambassador or a consul general are
#       downloaded to docs/ and their title, JORF reference and NOR code are
#       copied into the matching rows.
# Years for which the search reports no result are skipped without a CSV.

years = 2015:1990                     # processed from most recent to oldest
site = "http://legifrance.gouv.fr/"
per_page = 20                         # results listed on each search page

# Collapses runs of whitespace and of literal "\r", "\n", "\t" sequences.
blank_runs = "(\\\\r|\\\\n|\\n|\\\\t|\\s)+"

# Download `url` and save its body to `file` as UTF-8 text. HTTP errors stop
# the script instead of being cached on disk as if they were valid results.
# Extra arguments (e.g. `query`) are forwarded to httr::GET().
save_page = function(url, file, ...) {
  GET(url, ...) %>%
    stop_for_status() %>%
    content("text", encoding = "UTF-8") %>%
    writeLines(file)
}

for (i in years) {

  docs = str_c("data/docs-", i, ".csv")
  if (file.exists(docs)) {
    next # year already processed
  }

  # (1) get lists of nominative measures
  folder = str_c("raw/", i)
  dir.create(folder, showWarnings = FALSE)

  # the download is triggered by the absence of the first page, so years that
  # were already fetched are parsed from the cached files
  first = str_c(folder, "/001.html")
  fresh = !file.exists(first)

  cat("Year", i)

  if (fresh) {
    save_page(str_c(site, "rechExpMesuresNominatives.do"), first,
              query = list(champNom = "", champPrenom = "", champFonction = "",
                           champMinistere = "", champDecoration = "",
                           checkboxPeriode = "on",
                           champDatePublication1J = "01",
                           champDatePublication1M = "01",
                           champDatePublication1A = i,
                           champDatePublication2J = "31",
                           champDatePublication2M = "12",
                           champDatePublication2A = i))
  }

  # number of results: first number found in the <h3> headings of page 1
  h = read_html(first, encoding = "UTF-8")
  n = html_nodes(h, "h3") %>%
    html_text %>%
    str_extract("\\d+") %>%
    na.omit %>%
    as.integer
  n = n[1] # NA when no heading carries a number

  if (is.na(n) || n < 1) {
    cat(": empty\n") # skips 1993
    next
  }

  if (fresh) {

    # Every search gets its own fastReqId and must start on page 1, so a
    # failed download removes page 1: the next run restarts this year instead
    # of silently parsing an incomplete set of pages.
    restart_year = function(e) {
      unlink(first)
      stop(conditionMessage(e), call. = FALSE)
    }

    total = ceiling(n / per_page)
    p = seq_len(total)[-1] # pages left to get (none for one-page years)
    cat(":", str_pad(n, width = 5), "document(s)",
        str_pad(total, 5), "pages to download\n")

    if (length(p) > 0) {

      # link to the current search, taken from the pagination links
      r = html_nodes(h, "a") %>%
        html_attr("href") %>%
        str_extract("rechExpMesuresNominatives.do(.*)fastReqId=\\d+") %>%
        na.omit %>%
        unique
      if (length(r) == 0) {
        restart_year(simpleError(str_c("no pagination link found for ", i)))
      }
      r = r[1]

      pb = txtProgressBar(max = length(p), style = 3)
      for (k in seq_along(p)) {
        f = str_c(folder, "/", str_pad(p[k], width = 3, pad = "0"), ".html")
        tryCatch(save_page(str_c(site, r, "&page=", p[k]), f),
                 error = restart_year)
        setTxtProgressBar(pb, k)
      }

    }

    cat("\n")
    cat("Year", i)

  }

  # parse the lists: one tibble per cached page, bound together at the end
  d = list.files(folder, pattern = "\\.html$", full.names = TRUE) %>%
    lapply(function(page) {
      a = read_html(page, encoding = "UTF-8") %>%
        html_nodes("li.resultat1 a")
      txt = html_text(a) %>%
        str_replace_all(blank_runs, " ") %>%
        str_trim
      keep = txt != "Article" # drop the links labelled "Article"
      tibble(year = i, doc = txt[keep], url = html_attr(a, "href")[keep],
             title = NA_character_, jorf = NA_character_,
             nor = NA_character_)
    }) %>%
    bind_rows

  cat(":", str_pad(nrow(d), width = 5), "document(s)")

  # data sample: ambassadors and consuls general of either gender
  # (the final "e" is optional so that "consul général" is matched too)
  w = grepl("ambassad(eur|rice)|consul(e)? général(e)?", tolower(d$doc))
  w = unique(d$url[ w ])
  w = w[ !is.na(w) ]
  cat(str_pad(length(w), width = 5), "documents sampled\n")

  # (2) get the actual documents
  if (length(w) > 0) {

    pb = txtProgressBar(max = length(w), style = 3)

    for (k in seq_along(w)) {

      j = w[k]
      setTxtProgressBar(pb, k)

      id = str_extract(j, "JORFTEXT\\d+")
      if (is.na(id)) {
        warning("no JORFTEXT identifier in link: ", j, call. = FALSE)
        next
      }

      f = str_c("docs/", id, ".html")
      if (!file.exists(f)) {
        download.file(str_c(site, j), f, mode = "wb", quiet = TRUE)
      }
      h = read_html(f)

      # fill in dataset columns; `[1]` keeps the first match and yields NA
      # when the page has no such node, so each assignment gets one value
      rows = which(d$url == j)

      title = html_nodes(h, "h2") %>%
        html_text %>%
        str_replace_all(blank_runs, " ") %>%
        str_trim
      d$title[ rows ] = title[1]

      head = html_nodes(h, ".enteteTexte") %>% html_text
      head = head[1]
      d$jorf[ rows ] = str_extract(head, "JORF(.*)\\d{4}")
      d$nor[ rows ] = str_extract(head, "NOR: (.*)") %>%
        str_replace("NOR:\\s", "")

    }

    cat("\n")

  }

  write.csv(d, docs, row.names = FALSE)

}
library(ggplot2)
library(lubridate)

# Read back every yearly index (data/docs-*.csv), stack them, and code the
# type of each decision from the first word of its label: "Arrêté" (also
# spelled without its circumflex), "Décret", or "Divers" for everything else.
d = list.files("data", pattern = "docs-", full.names = TRUE) %>%
  lapply(read.csv, stringsAsFactors = FALSE) %>%
  bind_rows %>%
  filter(nchar(doc) > 0) %>% # remove a few empty rows
  mutate(type = str_extract(doc, "^(Arr(ê)?té|Citation|Décision|Décret|Exequatur|Liste|Résultats|Tableau)") %>%
           str_replace("Arrté", "Arrêté"),
         type = ifelse(type %in% c("Arrêté", "Décret"), type, "Divers"))

# elections examined
e = c(1995, 2002, 2007, 2012)

# time: quarters, as YEAR.QUARTER numbers (e.g. 1995.2), parsed from the JORF
# reference; rows without a reference get NA
# NOTE(review): month names are parsed with the "fr_FR" locale, whose name is
# platform-dependent -- confirm it exists on the machine running this.
d$year_q = str_replace(d$jorf, "JORF n°\\d+ du ", "") %>%
  parse_date_time("%d %m %Y", locale = "fr_FR") %>%
  quarter(with_year = TRUE)

# time: election examined; each year is attached to the election whose
# interval it falls in (intervals end on the year after each election)
d$year_g = cut(d$year, c(1990, e[ -length(e) ] + 1, 2016), labels = e,
               include.lowest = TRUE) %>%
  as.character

# time: electoral/post-electoral quarters; TRUE from the second quarter of
# the election year onwards. The threshold is derived from year_g, so it
# follows `e`, and sits between two quarters (.1 < .15 < .2) to avoid an
# exact floating-point comparison.
d$elec = d$year_q > as.numeric(d$year_g) + 0.15

# identify sampled rows
# NOTE(review): the sample keeps e + 2, while the cut points above use e + 1,
# so those years are grouped with the next election -- confirm this is wanted.
d$sample = !is.na(d$title) & d$year %in% c(e, e - 1, e - 2, e + 2)

# find gender of nominee (feminine job titles versus everything else)
d$gender = ifelse(grepl("ambassadrice|consule", d$title), "Females", "Males")
# plot quantities of nominations: one bar per quarter of the sampled years,
# in one column per election and one row per gender; bars are lighter before
# the election quarter
ggplot(filter(d, sample) %>%
         # Levels and labels both come from the sorted quarters, so each label
         # ("95.2" for 1995.2) stays attached to its own quarter whatever the
         # row order; quarters that could not be parsed (NA) are left out.
         mutate(year_q = factor(year_q,
                                levels = sort(unique(year_q)),
                                labels = sort(unique(year_q)) %>%
                                  str_sub(-4)),
                year_g = str_c(year_g, " election"))) +
  geom_bar(aes(x = year_q, alpha = elec)) +
  scale_alpha_manual("Period:", values = c("TRUE" = 1, "FALSE" = 0.5),
                     labels = c("pre-electoral", "post-electoral")) +
  facet_grid(gender ~ year_g, scales = "free", space = "free_x") +
  labs(x = "\nYear.Quarter", y = "Number of nomination decrees\n") +
  theme_bw() +
  theme(axis.text.x = element_text(size = rel(0.5)),
        legend.position = "bottom",
        panel.grid = element_blank())

# saves the plot built above (ggsave defaults to the last plot created)
ggsave("jorf.png", width = 12, height = 6)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment