Last active
December 18, 2015 14:32
-
-
Save briatte/005b174f49e04937c31e to your computer and use it in GitHub Desktop.
scrape the Front national's online news items
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr)
library(ggplot2)
library(httr)
library(lubridate)
library(rvest)
library(stringr)

# Local cache holding the raw HTML of every listing page.
dir.create("pages", showWarnings = FALSE)

cat("Downloading pages ")

# Visit the listing pages from index 228 down to 1. Each page yields one data
# frame (url, title, meta); the pieces are combined once at the end instead of
# growing a data frame inside the loop.
pages <- lapply(228:1, function(i) {
  cat(i, "... ")
  p <- paste0("http://www.frontnational.com/actualites/page/", i)
  f <- paste0("pages/page", i, ".html")
  # Download only the pages that are not cached yet.
  if (!file.exists(f)) {
    h <- GET(p, user_agent("Whatever/0.0"))
    # Fail on HTTP errors so that an error page is never stored in the cache
    # (a cached file is not downloaded again on later runs).
    stop_for_status(h)
    writeLines(content(h, "text"), f)
  }
  h <- read_html(f)
  links <- html_nodes(h, "#pk_content h3.pk_entry_title a")
  data.frame(
    url = html_attr(links, "href"),
    title = html_text(links),
    meta = html_text(html_nodes(h, ".pk_entry_meta p")),
    stringsAsFactors = FALSE
  )
})

# One row per news item, in the order the pages were visited.
b <- do.call(rbind, pages)
# Date: the part of the meta line that ends with a four-digit year followed by
# "/"; the last character (the "/") is dropped before parsing.
# NOTE(review): "FR_fr" is not the usual "fr_FR" spelling of a locale name;
# confirm that it resolves on the platform this runs on.
b$date <- str_extract(b$meta, "(.*)\\d{4}/")
b$date <- parse_date_time(str_sub(b$date, end = -2), "%d %B %Y", locale = "FR_fr")
b$date <- as.Date(b$date)

# The meta line is "/"-separated; split it once and reuse the pieces.
meta_parts <- str_split(b$meta, "/")

# Category: second field of the meta line (NA when there is no second field).
b$category <- vapply(meta_parts, function(x) x[2], character(1))

# Tags: first field mentioning "Mots-clefs", without its "Mots-clefs: " label;
# NA when the item has no such field. vapply() keeps the column a character
# vector even if no item has tags.
b$tags <- vapply(meta_parts, function(x) {
  x <- x[grepl("Mots-clefs", x)]
  x <- gsub("Mots-clefs: ", "", x)
  if (length(x) > 0) x[1] else NA_character_
}, character(1))
# Tag clean-up. Tags are a comma-separated string per item, so every
# substitution below must leave the separating commas intact.

# Replace the tag `singular` (followed by a comma or the end of the string) by
# `plural`, putting back whatever separator was matched.
pluralise_tag <- function(tags, singular, plural) {
  gsub(paste0(singular, "(,|$)"), paste0(plural, "\\1"), tags)
}

# bugfixes (from the 'Médias' tags)
b$tags <- tolower(b$tags)
b$tags <- gsub("\\.", "", b$tags)
b$tags <- gsub("municipales(,|$)", "municipales 2014\\1", b$tags)
b$tags <- gsub("munciipales", "municipales", b$tags)
b$tags <- gsub("parti socialiste", "ps", b$tags)
b$tags <- pluralise_tag(b$tags, "taxi", "taxis")

# more bugfixes
# The two 'européennes' spellings are handled separately so that neither adds
# an extra comma (which would create an empty tag when the string is split).
b$tags <- gsub("elections européennes 2014", "européennes 2014", b$tags)
b$tags <- gsub("européennes(,|$)", "européennes 2014\\1", b$tags)
b$tags <- gsub("union européene", "union européenne", b$tags)
b$tags <- gsub("udt 2013", "udt", b$tags)
b$tags <- gsub("européeen", "européen", b$tags)
b$tags <- pluralise_tag(b$tags, "energie", "energies")
b$tags <- pluralise_tag(b$tags, "fraude", "fraudes")
b$tags <- pluralise_tag(b$tags, "profanation", "profanations")
##b$tags = gsub("islamisme(,|$)", "???", b$tags)
b$tags <- pluralise_tag(b$tags, "investissement", "investissements")
b$tags <- pluralise_tag(b$tags, "privatisation", "privatisations")
b$tags <- gsub("réforme territoriales", "réforme territoriale", b$tags)
b$tags <- pluralise_tag(b$tags, "taxe", "taxes")
b$tags <- pluralise_tag(b$tags, "faillite", "faillites")
# Fix the misspelt surname only; the next line adds the first name where it is
# missing, so an already-prefixed typo does not end up with the name twice.
b$tags <- gsub("hollnade|hoolande", "hollande", b$tags)
b$tags <- gsub("(françois )?hollande", "françois hollande", b$tags)
b$tags <- gsub("(nicolas )?sarkozy", "nicolas sarkozy", b$tags)
# b$tags = gsub("(charlie )?hebdo", "charlie hebdo", b$tags)
# NOTE(review): this also rewrites the start of longer tags such as
# "anti-fascistes" -- confirm no such tag exists in the data.
b$tags <- gsub("anti-fa", "antifascistes", b$tags)
# Word boundaries keep the rule from rewriting "ball" inside other words
# (e.g. "handball").
b$tags <- gsub("\\b(foot)?ball\\b", "football", b$tags)
b$tags <- gsub("ultra(-)?libéralisme", "ultra-libéralisme", b$tags)

# Sorted list of the distinct tags after clean-up.
sort(unique(unlist(strsplit(b$tags, ",\\s?"))))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(animation)
library(network)
library(GGally)

# 697 unique terms
unique(unlist(strsplit(b$tags, ",\\s?")))

# most frequent ones (tags used more than 100 times)
w <- unlist(strsplit(b$tags, ",\\s?"))
w <- table(w)
w <- w[w > 100]
data.frame(w)

# Edge list: for every item, all unordered pairs of its distinct tags. Within
# a pair the tags are in sort order, so the same pair always has the same
# (i, j) orientation across items.
e <- lapply(b$tags, function(x) {
  y <- unique(unlist(strsplit(x, ",\\s?")))
  y <- sort(y[!is.na(y)])
  # Items with fewer than two tags contribute no pair.
  if (length(y) < 2) {
    return(data.frame(i = character(0), j = character(0),
                      stringsAsFactors = FALSE))
  }
  pairs <- combn(y, 2)
  data.frame(i = pairs[1, ], j = pairs[2, ], stringsAsFactors = FALSE)
})
e <- bind_rows(e)

# Edge weight n = number of items in which the two tags appear together.
e <- group_by(e, i, j) %>% summarise(n = n()) %>% data.frame

# Undirected network over the tag pairs, with the counts stored as the
# "weight" edge attribute.
n <- network(e[, 1:2], directed = FALSE)
set.edge.attribute(n, "weight", e[, 3])
# Bin the edge weights into five classes (1, 2, 3, 4 and above 4), then
# rescale the class index so that the largest observed class equals 1.
w <- cut(e[, 3], c(0, 1, 2, 3, 4, Inf))
w <- as.numeric(w)
w <- w / max(w)

# full network, edge width given by the rescaled weight class
ggnet(n,
      size = 1,
      segment.size = w,
      segment.alpha = .5,
      segment.color = "black")
ggsave("fn_network_0.png", width = 9, height = 9)

# sub-network keeping only the edges of the highest observed class
# (5+ co-occurrences when such edges exist), without isolated nodes
# NOTE(review): isolates() and degree() are not provided by any library()
# call visible in this script -- presumably they come from sna; confirm that
# it is attached when this runs.
nn <- n
delete.edges(nn, which(w < 1))
delete.vertices(nn, isolates(nn))

# Label size follows the node degree in the sub-network, classified with the
# degree quantiles of the full network as breaks.
label_breaks <- unique(quantile(degree(n)))
label_class <- cut(degree(nn), label_breaks, include.lowest = TRUE)
ggnet(nn,
      size = 0,
      segment.color = "black",
      segment.alpha = .5,
      label.nodes = TRUE,
      label.size = 3 * as.numeric(label_class))
ggsave("fn_network_5.png", width = 9, height = 9)
# animation: one frame per edge-weight threshold, written to a GIF
saveGIF({
  w <- sort(unique(e[, 3]))
  # Thresholds start at the second-smallest weight. Each frame keeps exactly
  # the edges whose weight is at least `i`, so the number shown in the title
  # is the true minimum number of co-occurrences in that frame.
  for (i in w[-1]) {
    nn <- n
    delete.edges(nn, which(e[, 3] < i))
    delete.vertices(nn, isolates(nn))
    # Label size follows the node degree in the reduced network, classified
    # with the degree quantiles of the full network as breaks.
    g <- ggnet(nn, size = 0,
               segment.alpha = .5, segment.color = "black",
               label.nodes = TRUE,
               label.size = 3 * as.numeric(cut(degree(nn),
                                               unique(quantile(degree(n))),
                                               include.lowest = TRUE))) +
      ggtitle(paste("Associations apparaissant au moins", i, "fois"))
    print(g)
  }
}, movie.name = "fn_network.gif", interval = 1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## MEDIAS

# item counts for every category whose name mentions "Médias"
table(b$category)[grepl("Médias", names(table(b$category)))]

# keep only the items filed under such a category
m <- b[grepl("Médias", b$category), ]

## explore with this
## m$who = gsub("(.*) sur (.*)", "\\1", m$title)

# first known name found in each title (NA when none of them matches)
m$who <- str_extract(m$title, "Marine (L|l)e Pen|Marion( |-)Maréchal|Nicolas Bay|Gollnisch|F(l)?(orian|\\.) (P|p)hilippot|Louis Aliot|Steeve Briois|Sébastien Chenu|Stéphane Ravier|Thibault de la Tocnaye|Wallerand de Saint( |-)Just|Marie-Christine Arnautu|Bousquet-Cassagne|Gilbert Collard|Julien Sanchez|Julien Rochedy|Jean-Marie Le Pen|Lacoste-Lareymondie|Philippe Murer|Sophie Montel|Laurent Lopez|Gaëtan Dussausaye|David Rachline|Bernard Monot|Aymeric Chauprade|Michel Guiniot|Edouard Cavin|Jean-Claude Otto-Bruc|Dominique Martin")
cat("MEDIAS: missing", sum(is.na(m$who)), "out of", nrow(m), "items\n")

# normalise the matches: hyphens become spaces, everything is lower-cased, and
# the surname-only / abbreviated forms are mapped to a single full name
m$who <- tolower(gsub("-", " ", m$who))
m$who[grepl("gollnisch", m$who)] <- "bruno gollnisch"
m$who[grepl("philippot", m$who)] <- "florian philippot"
sort(unique(m$who))

# selected people: names found in more than 10 items keep their own label,
# every other name is pooled, and items without a name stay NA
who_counts <- table(m$who)
w <- who_counts[who_counts > 10]
m$who2 <- ifelse(m$who %in% names(w), m$who, "(autres)")
m$who2[is.na(m$who)] <- NA

# year-month (month not zero-padded)
m$ym <- paste(year(m$date), month(m$date), sep = "-")

# bars: number of items over time, filled by person
qplot(data = m, x = date, fill = who2) +
  theme_bw() +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "\nAnnée-mois", y = "Nombre d'articles, catégorie 'Médias'\n") +
  scale_fill_discrete("", na.value = NA)
ggsave("fn_medias_bars.png", width = 9, height = 7)
# Zero-padded year-month key ("YYYY-MM") used to bin the items.
m$bin <- paste(year(m$date), str_pad(month(m$date), 2, pad = "0"), sep = "-")

# Per-bin summary for one person: number of items (n), number of items
# attributed to `person` (fp) and their share (ratio), plus the year and
# month parts of the bin.
monthly_share <- function(m, person) {
  group_by(m, bin) %>%
    summarise(n = n(),
              fp = sum(who2 == person, na.rm = TRUE),
              ratio = fp / n) %>%
    mutate(year = str_sub(bin, 1, 4), month = str_sub(bin, start = -2))
}

# Bar chart of the monthly share, one facet per year. Months 07 and 08 are
# left out, and the bars of the election months listed below get the other
# fill colour.
# 2014-03 = municipal
# 2014-05 = EU
# 2015-03 = departmental
# 2015-12 = regional
plot_monthly_share <- function(mbins, y_label) {
  ggplot(filter(mbins, !grepl("-0(7|8)$", bin)),
         aes(y = ratio, x = month,
             fill = !grepl("2014-(03|05)|2015-(03|12)", bin))) +
    geom_bar(stat = "identity") +
    guides(fill = FALSE) +
    theme_bw() +
    theme(panel.grid.major.x = element_blank()) +
    # Only one fill scale is set: a second scale for the same aesthetic would
    # simply replace the first one.
    scale_fill_discrete("", na.value = NA) +
    labs(y = y_label, x = "\nMois") +
    facet_grid(~year, scales = "free_x", space = "free_x")
}

# Florian Philippot
mbins <- monthly_share(m, "florian philippot")
p <- plot_monthly_share(mbins, "Philippot / Autres, catégorie 'Médias'\n")
p
ggsave("fn_medias_philippot_bars.png", plot = p, width = 9, height = 7)

# Nicolas Bay
mbins <- monthly_share(m, "nicolas bay")
p <- plot_monthly_share(mbins, "Bay / Autres, catégorie 'Médias'\n")
p
ggsave("fn_medias_bay_bars.png", plot = p, width = 9, height = 7)
# lines: number of items per zero-padded year-month and person
w <- data.frame(
  ym = paste0(year(m$date), "-", sprintf("%02.0f", month(m$date))),
  who = m$who2
)
w <- w %>%
  group_by(ym, who) %>%
  summarise(n = n())
qplot(data = w, x = ym, y = n, color = who, group = who, geom = "line") +
  theme_bw()

# counts: items per named person and year-month, largest first
# (the pooled "(autres)" label is excluded)
m %>%
  filter(who2 != "(autres)") %>%
  group_by(who2, ym) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))
## INTERVENTIONS

# item counts for every category whose name mentions "Interventions"
table(b$category)[grepl("Interventions", names(table(b$category)))]

# keep only the items filed under such a category
i <- b[grepl("Interventions", b$category), ]

# first known name found in each title (NA when none of them matches);
# "Maréchal" is accepted with or without the accent, and the list also covers
# Edouard Ferrand, Gilles Lebreton, Joëlle Mélin, Philippe Loiseau and
# Mireille d’Ornano
i$who <- str_extract(i$title, "Marine (L|l)e Pen|Marion( |-)Mar(é|e)chal|Nicolas Bay|Gollnisch|F(l)?(orian|\\.) (P|p)hilippot|Louis Aliot|Steeve Briois|Sébastien Chenu|Stéphane Ravier|Thibault de la Tocnaye|Wallerand de Saint( |-)Just|Marie-Christine Arnautu|Bousquet-Cassagne|Gilbert Collard|Julien Sanchez|Julien Rochedy|Jean-Marie Le Pen|Lacoste-Lareymondie|Philippe Murer|Sophie Montel|Laurent Lopez|Gaëtan Dussausaye|David Rachline|Bernard Monot|Aymeric Chauprade|Michel Guiniot|Edouard Cavin|Jean-Claude Otto-Bruc|Dominique Martin|Edouard Ferrand|Gilles Lebreton|Joëlle Mélin|Philippe Loiseau|Mireille d’Ornano")
cat("INTERVENTIONS: missing", sum(is.na(i$who)), "out of", nrow(i), "items\n")

# titles in which no name was found
i$title[is.na(i$who)]

# normalise the matches: hyphens become spaces, everything is lower-cased, and
# the surname-only / abbreviated forms are mapped to a single full name
i$who <- tolower(gsub("-", " ", i$who))
i$who[grepl("gollnisch", i$who)] <- "bruno gollnisch"
i$who[grepl("philippot", i$who)] <- "florian philippot"
sort(unique(i$who))

# selected people: names found in more than 5 items keep their own label,
# every other name is pooled, and items without a name stay NA
who_counts <- table(i$who)
w <- who_counts[who_counts > 5]
i$who2 <- ifelse(i$who %in% names(w), i$who, "(autres)")
i$who2[is.na(i$who)] <- NA

# year-month (month not zero-padded)
i$ym <- paste(year(i$date), month(i$date), sep = "-")

# bars: number of items over time, filled by person
qplot(data = i, x = date, fill = who2) +
  theme_bw() +
  theme(panel.grid.major.x = element_blank()) +
  labs(x = "\nAnnée-mois",
       y = "Nombre d'articles, catégorie 'Interventions'\n") +
  scale_fill_discrete("", na.value = NA)
ggsave("fn_interventions_bars.png", width = 9, height = 7)

# lines: number of items per zero-padded year-month and person
w <- data.frame(
  ym = paste0(year(i$date), "-", sprintf("%02.0f", month(i$date))),
  who = i$who2
)
w <- w %>%
  group_by(ym, who) %>%
  summarise(n = n())
qplot(data = w, x = ym, y = n, color = who, group = who, geom = "line") +
  theme_bw()

# counts: items per named person and year-month, largest first
# (the pooled "(autres)" label is excluded)
i %>%
  filter(who2 != "(autres)") %>%
  group_by(who2, ym) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment