Skip to content

Instantly share code, notes, and snippets.

@briatte
Last active December 18, 2015 14:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save briatte/005b174f49e04937c31e to your computer and use it in GitHub Desktop.
Save briatte/005b174f49e04937c31e to your computer and use it in GitHub Desktop.
scrape the Front national's online news items
library(dplyr)
library(ggplot2)
library(httr)
library(lubridate)
library(rvest)
library(stringr)
# Download every listing page of the news section (pages 228 down to 1),
# caching the raw HTML under pages/ so that re-runs only fetch missing pages,
# then collect the link, title and metadata line of each item into `b`.
dir.create("pages", showWarnings = FALSE)

page_ids <- 228:1
# one data frame per page, bound together once after the loop
# (avoids re-copying the whole of `b` on every iteration)
pages <- vector("list", length(page_ids))

cat("Downloading pages ")
for (k in seq_along(page_ids)) {
  i <- page_ids[k]
  cat(i, "... ")
  p <- paste0("http://www.frontnational.com/actualites/page/", i)
  f <- paste0("pages/page", i, ".html")
  if (!file.exists(f)) {
    r <- GET(p, user_agent("Whatever/0.0"))
    # do not cache an error page: it would be silently reused on every
    # later run; skip the page so that the next run retries it
    if (status_code(r) >= 400) {
      warning("page ", i, " returned HTTP ", status_code(r), "; skipped",
              call. = FALSE)
      next
    }
    writeLines(content(r, "text"), f)
  }
  h <- read_html(f)
  # the same anchors carry both the item URL and its title
  links <- html_nodes(h, "#pk_content h3.pk_entry_title a")
  pages[[k]] <- data.frame(
    url = html_attr(links, "href"),
    title = html_text(links),
    meta = html_text(html_nodes(h, ".pk_entry_meta p")),
    stringsAsFactors = FALSE
  )
}
# skipped pages are NULL entries, which bind_rows() ignores
b <- as.data.frame(bind_rows(pages), stringsAsFactors = FALSE)
# Derive date, category and tags from the metadata line of each item.
# The date is the text up to (and including) a four-digit year followed by
# "/"; the trailing "/" is dropped before parsing.
b$date <- str_extract(b$meta, "(.*)\\d{4}/")
# NOTE(review): the locale name "FR_fr" is platform-dependent -- confirm it
# resolves to a French locale on the machine running the script
b$date <- parse_date_time(str_sub(b$date, end = -2), "%d %B %Y",
                          locale = "FR_fr")
b$date <- as.Date(b$date)

# split the metadata once; both fields below read from the same pieces
meta_fields <- str_split(b$meta, "/")

# category: second "/"-separated field (NA when there is none)
b$category <- vapply(meta_fields, function(x) x[2], character(1))

# tags: the field containing "Mots-clefs", without that prefix
# (NA when the item has no such field)
b$tags <- vapply(meta_fields, function(x) {
  x <- x[grepl("Mots-clefs", x)]
  x <- gsub("Mots-clefs: ", "", x)
  if (length(x) > 0) x[1] else NA_character_
}, character(1))
# Normalise the comma-separated tag strings: lower-case them, strip dots,
# fix typos and merge spelling variants of the same tag.
#
# Patterns of the form "x(,|$)" match a whole trailing tag; the captured
# delimiter is written back with "\\1" so that the comma separating the tag
# from the next one is preserved (dropping it would fuse two tags into one).

# bugfixes (from the 'Médias' tags)
b$tags <- tolower(b$tags)
b$tags <- gsub("\\.", "", b$tags)
# fix the typo first, so that the corrected tag is normalised just below
b$tags <- gsub("munciipales", "municipales", b$tags)
b$tags <- gsub("municipales(,|$)", "municipales 2014\\1", b$tags)
b$tags <- gsub("parti socialiste", "ps", b$tags)
b$tags <- gsub("taxi(,|$)", "taxis\\1", b$tags)

# more bugfixes
# handled in two steps so that neither replacement leaves a doubled comma
# (which would later split into an empty tag)
b$tags <- gsub("elections européennes 2014", "européennes 2014", b$tags)
b$tags <- gsub("européennes(,|$)", "européennes 2014\\1", b$tags)
b$tags <- gsub("union européene", "union européenne", b$tags)
b$tags <- gsub("udt 2013", "udt", b$tags)
b$tags <- gsub("européeen", "européen", b$tags)
b$tags <- gsub("energie(,|$)", "energies\\1", b$tags)
b$tags <- gsub("fraude(,|$)", "fraudes\\1", b$tags)
b$tags <- gsub("profanation(,|$)", "profanations\\1", b$tags)
## b$tags = gsub("islamisme(,|$)", "???", b$tags)
b$tags <- gsub("investissement(,|$)", "investissements\\1", b$tags)
b$tags <- gsub("privatisation(,|$)", "privatisations\\1", b$tags)
b$tags <- gsub("réforme territoriales", "réforme territoriale", b$tags)
b$tags <- gsub("taxe(,|$)", "taxes\\1", b$tags)
b$tags <- gsub("faillite(,|$)", "faillites\\1", b$tags)

# people: correct the misspelt surname only, then let the next rule add the
# first name (avoids "françois françois hollande" when the first name was
# already present next to the typo)
b$tags <- gsub("hollnade|hoolande", "hollande", b$tags)
b$tags <- gsub("(françois )?hollande", "françois hollande", b$tags)
b$tags <- gsub("(nicolas )?sarkozy", "nicolas sarkozy", b$tags)
# b$tags = gsub("(charlie )?hebdo", "charlie hebdo", b$tags)

# NOTE(review): "anti-fa" also matches the start of longer words such as
# "anti-fascistes" -- confirm no such tag exists
b$tags <- gsub("anti-fa", "antifascistes", b$tags)
# NOTE(review): this rewrites any "ball" (e.g. inside "handball") to
# "football" -- confirm that is intended
b$tags <- gsub("(foot)?ball", "football", b$tags)
b$tags <- gsub("ultra(-)?libéralisme", "ultra-libéralisme", b$tags)

# alphabetical list of the distinct tags, for visual inspection
sort(unique(unlist(strsplit(b$tags, ",\\s?"))))
library(animation)
library(network)
library(GGally)
# every tag occurrence, one element per tag per item
w <- unlist(strsplit(b$tags, ",\\s?"))

# distinct terms (697 according to the original author's note)
unique(w)

# frequency table restricted to the terms used more than 100 times
w <- table(w)
w <- w[w > 100]
data.frame(w)
# Build the tag co-occurrence edge list: for every item, each unordered pair
# of distinct tags yields one row (i, j), with i sorted before j so that the
# same pair always appears in the same orientation.
e <- lapply(b$tags, function(x) {
  # sort() also drops the NA of untagged items
  y <- sort(unique(unlist(strsplit(x, ",\\s?"))))
  # fewer than two tags: no pair to record
  if (length(y) < 2) {
    return(data.frame(i = character(0), j = character(0),
                      stringsAsFactors = FALSE))
  }
  # combn() on the sorted tags enumerates each unordered pair exactly once
  pairs <- combn(y, 2)
  data.frame(i = pairs[1, ], j = pairs[2, ], stringsAsFactors = FALSE)
})
e <- bind_rows(e)

# one row per pair, with n = number of items in which both tags appear
e <- group_by(e, i, j) %>%
  summarise(n = n()) %>%
  data.frame()
# undirected network built from the first two columns of `e`; the third
# column is stored on the edges as their "weight"
n <- network(e[, 1:2], directed = FALSE)
set.edge.attribute(n, "weight", e[, 3])

# bin the edge counts into five width classes (1, 2, 3, 4 and above 4),
# then rescale so that the widest class present equals 1
w <- as.numeric(cut(e[, 3], c(0, 1, 2, 3, 4, Inf)))
w <- w / max(w)

# full network, edge width given by the class computed above
ggnet(
  n,
  size = 1,
  segment.color = "black",
  segment.alpha = 0.5,
  segment.size = w
)
ggsave("fn_network_0.png", width = 9, height = 9)

# sub-network keeping only the edges of the widest class (5+ co-occurrences
# when that class is populated), without the nodes left unconnected
# NOTE(review): isolates() and degree() are not obviously provided by the
# packages attached in this file (they look like sna functions) -- confirm
# where they come from
nn <- n
delete.edges(nn, which(w < 1))
delete.vertices(nn, isolates(nn))

# labels instead of points; label size grows with the node degree, binned on
# the degree quantiles of the full network `n`
ggnet(
  nn,
  size = 0,
  segment.alpha = 0.5,
  segment.color = "black",
  label.nodes = TRUE,
  label.size = 3 * as.numeric(cut(
    degree(nn),
    unique(quantile(degree(n))),
    include.lowest = TRUE
  ))
)
ggsave("fn_network_5.png", width = 9, height = 9)
# animation: one frame per distinct co-occurrence count except the lowest,
# each frame showing only the associations seen at least that many times
saveGIF({
  # distinct edge counts, ascending
  w <- sort(unique(e[, 3]))
  # start at the second value: the lowest threshold would be the full network
  for (k in seq_along(w)[-1]) {
    nn <- n
    # drop every edge rarer than the current threshold, then the nodes that
    # are left without any edge
    delete.edges(nn, which(e[, 3] < w[k]))
    delete.vertices(nn, isolates(nn))
    g <- ggnet(nn, size = 0,
               segment.alpha = .5, segment.color = "black",
               label.nodes = TRUE,
               # label size binned on the degree quantiles of the full
               # network, so the scale is the same in every frame
               label.size = 3 * as.numeric(cut(degree(nn),
                                               unique(quantile(degree(n))),
                                               include.lowest = TRUE))) +
      # the title states the smallest count still displayed (the original
      # title quoted the last count that had just been removed)
      ggtitle(paste("Associations apparaissant au moins", w[k], "fois"))
    print(g)
  }
}, movie.name = "fn_network.gif", interval = 1)
## MEDIAS

# item counts of the categories whose name contains "Médias"
table(b$category)[grepl("Médias", names(table(b$category)))]

# items filed under those categories
m <- b[grepl("Médias", b$category), ]

## explore with this
## m$who = gsub("(.*) sur (.*)", "\\1", m$title)

# first known party figure named in the title (NA when none matches)
m$who <- str_extract(
  m$title,
  "Marine (L|l)e Pen|Marion( |-)Maréchal|Nicolas Bay|Gollnisch|F(l)?(orian|\\.) (P|p)hilippot|Louis Aliot|Steeve Briois|Sébastien Chenu|Stéphane Ravier|Thibault de la Tocnaye|Wallerand de Saint( |-)Just|Marie-Christine Arnautu|Bousquet-Cassagne|Gilbert Collard|Julien Sanchez|Julien Rochedy|Jean-Marie Le Pen|Lacoste-Lareymondie|Philippe Murer|Sophie Montel|Laurent Lopez|Gaëtan Dussausaye|David Rachline|Bernard Monot|Aymeric Chauprade|Michel Guiniot|Edouard Cavin|Jean-Claude Otto-Bruc|Dominique Martin"
)
cat("MEDIAS: missing", sum(is.na(m$who)), "out of", nrow(m), "items\n")

# harmonise the spellings: hyphens become spaces, everything lower-case, and
# the surname-only / abbreviated matches get the full name
m$who <- tolower(gsub("-", " ", m$who))
m$who[grepl("gollnisch", m$who)] <- "bruno gollnisch"
m$who[grepl("philippot", m$who)] <- "florian philippot"
sort(unique(m$who))

# selected people: those named in more than 10 items keep their name in
# `who2`, everyone else is pooled as "(autres)"; unmatched titles stay NA
w <- table(m$who)
w <- w[w > 10]
m$who2 <- ifelse(m$who %in% names(w), m$who, "(autres)")
m$who2[is.na(m$who)] <- NA

# year-month label (month not zero-padded)
m$ym <- paste0(year(m$date), "-", month(m$date))

# bars: items over time, filled by person
qplot(data = m, fill = who2, x = date) +
  theme_bw() +
  theme(panel.grid.major.x = element_blank()) +
  scale_fill_discrete("", na.value = NA) +
  labs(
    y = "Nombre d'articles, catégorie 'Médias'\n",
    x = "\nAnnée-mois"
  )
ggsave("fn_medias_bars.png", width = 9, height = 7)
# Monthly share of the 'Médias' items attributed to one person.
#
# items:  data frame with a `bin` column ("YYYY-MM") and a `who2` column
# person: value of `who2` to count
# Returns one row per bin: n (all items), fp (items by `person`),
# ratio (fp / n), plus the year and month parts of the bin as strings.
fn_monthly_share <- function(items, person) {
  group_by(items, bin) %>%
    summarise(n = n(),
              fp = sum(who2 == person, na.rm = TRUE),
              ratio = fp / n) %>%
    mutate(year = str_sub(bin, 1, 4), month = str_sub(bin, start = -2))
}

# Bar plot of the monthly ratio, one facet per year.
#
# bins:    output of fn_monthly_share()
# y_label: label of the vertical axis
# The July and August bins are left out. Bars are coloured by whether the
# bin is one of the election months listed below; the legend is hidden.
#   2014-03 = municipal
#   2014-05 = EU
#   2015-03 = departmental
#   2015-12 = regional
# Only one fill scale is set: the original also added scale_fill_discrete()
# after scale_fill_brewer(), which silently replaced the Set2 palette.
fn_share_plot <- function(bins, y_label) {
  ggplot(filter(bins, !grepl("-0(7|8)$", bin)),
         aes(y = ratio, x = month,
             fill = !grepl("2014-(03|05)|2015-(03|12)", bin))) +
    geom_bar(stat = "identity") +
    scale_fill_brewer(palette = "Set2") +
    guides(fill = FALSE) +
    theme_bw() +
    theme(panel.grid.major.x = element_blank()) +
    labs(y = y_label, x = "\nMois") +
    facet_grid(~year, scales = "free_x", space = "free_x")
}

# zero-padded year-month bin, shared by both plots
m$bin <- paste(year(m$date), str_pad(month(m$date), 2, pad = "0"), sep = "-")

# Florian Philippot
mbins <- fn_monthly_share(m, "florian philippot")
fn_share_plot(mbins, "Philippot / Autres, catégorie 'Médias'\n")
ggsave("fn_medias_philippot_bars.png", width = 9, height = 7)

# Nicolas Bay
mbins <- fn_monthly_share(m, "nicolas bay")
fn_share_plot(mbins, "Bay / Autres, catégorie 'Médias'\n")
ggsave("fn_medias_bay_bars.png", width = 9, height = 7)
# lines: number of items per person and per (zero-padded) year-month
w <- data.frame(
  ym = paste0(year(m$date), "-", sprintf("%02.0f", month(m$date))),
  who = m$who2
)
w <- w %>%
  group_by(ym, who) %>%
  summarise(n = n())
qplot(data = w, x = ym, y = n, color = who, group = who, geom = "line") +
  theme_bw()

# counts: busiest person-months first, pooled "(autres)" left out
m %>%
  filter(who2 != "(autres)") %>%
  group_by(who2, ym) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))
## INTERVENTIONS

# item counts of the categories whose name contains "Interventions"
table(b$category)[grepl("Interventions", names(table(b$category)))]

# items filed under those categories
i <- b[grepl("Interventions", b$category), ]

# first known party figure named in the title (NA when none matches);
# compared with the 'Médias' pattern, Marion Maréchal is matched with or
# without the accent, and these names are added: Edouard Ferrand,
# Gilles Lebreton, Joëlle Mélin, Philippe Loiseau, Mireille d’Ornano
i$who <- str_extract(
  i$title,
  "Marine (L|l)e Pen|Marion( |-)Mar(é|e)chal|Nicolas Bay|Gollnisch|F(l)?(orian|\\.) (P|p)hilippot|Louis Aliot|Steeve Briois|Sébastien Chenu|Stéphane Ravier|Thibault de la Tocnaye|Wallerand de Saint( |-)Just|Marie-Christine Arnautu|Bousquet-Cassagne|Gilbert Collard|Julien Sanchez|Julien Rochedy|Jean-Marie Le Pen|Lacoste-Lareymondie|Philippe Murer|Sophie Montel|Laurent Lopez|Gaëtan Dussausaye|David Rachline|Bernard Monot|Aymeric Chauprade|Michel Guiniot|Edouard Cavin|Jean-Claude Otto-Bruc|Dominique Martin|Edouard Ferrand|Gilles Lebreton|Joëlle Mélin|Philippe Loiseau|Mireille d’Ornano"
)
cat("INTERVENTIONS: missing", sum(is.na(i$who)), "out of", nrow(i), "items\n")

# titles in which nobody was recognised, for inspection
i$title[is.na(i$who)]

# harmonise the spellings: hyphens become spaces, everything lower-case, and
# the surname-only / abbreviated matches get the full name
i$who <- tolower(gsub("-", " ", i$who))
i$who[grepl("gollnisch", i$who)] <- "bruno gollnisch"
i$who[grepl("philippot", i$who)] <- "florian philippot"
sort(unique(i$who))

# selected people: those named in more than 5 items keep their name in
# `who2`, everyone else is pooled as "(autres)"; unmatched titles stay NA
w <- table(i$who)
w <- w[w > 5]
i$who2 <- ifelse(i$who %in% names(w), i$who, "(autres)")
i$who2[is.na(i$who)] <- NA

# year-month label (month not zero-padded)
i$ym <- paste0(year(i$date), "-", month(i$date))

# bars: items over time, filled by person
qplot(data = i, fill = who2, x = date) +
  theme_bw() +
  theme(panel.grid.major.x = element_blank()) +
  scale_fill_discrete("", na.value = NA) +
  labs(
    y = "Nombre d'articles, catégorie 'Interventions'\n",
    x = "\nAnnée-mois"
  )
ggsave("fn_interventions_bars.png", width = 9, height = 7)
# lines: number of items per person and per (zero-padded) year-month
w <- data.frame(
  ym = paste0(year(i$date), "-", sprintf("%02.0f", month(i$date))),
  who = i$who2
)
w <- w %>%
  group_by(ym, who) %>%
  summarise(n = n())
qplot(data = w, x = ym, y = n, color = who, group = who, geom = "line") +
  theme_bw()

# counts: busiest person-months first, pooled "(autres)" left out
i %>%
  filter(who2 != "(autres)") %>%
  group_by(who2, ym) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(desc(n))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment