Instantly share code, notes, and snippets.

Embed
What would you like to do?
scrape front covers from Charlie Hebdo – source: http://stripsjournal.canalblog.com/
#
# download 338 Charlie Hebdo covers with keywords
#
library(dplyr)
library(XML)
library(lubridate)
library(stringr)
library(ggplot2)
source("charlie-themes.r")
dir.create("pages", showWarnings = FALSE)
dir.create("covers", showWarnings = FALSE)
covers = "charlie.csv"
if(!file.exists(covers)) {
d = data.frame()
for(i in seq(330, 0, -10)) {
cat("Parsing page", sprintf("%2.0f", i / 10))
file = paste0("pages/page-", i, ".html")
if(!file.exists(file))
h = try(download.file(paste0("http://stripsjournal.canalblog.com/tag/Les%20Unes%20de%20Charlie%20Hebdo/p",
i, "-0.html"), file, mode = "wb", quiet = TRUE), silent = TRUE)
if(!file.info(file)$size) {
cat(": failed\n")
file.remove(file)
} else {
h = htmlParse(file)
addr = xpathSApply(h, "//div[@class='blogbody']//meta[@itemprop='url']/@content")
date = xpathSApply(h, "//div[@class='blogbody']//meta[@itemprop='dateCreated']/@content")
titl = xpathSApply(h, "//div[@class='blogbody']//h3", xmlValue)
dat = gsub("(.*) - (.*) - (.*)", "\\3", titl)
dat = gsub("(.*)- (.*)", "\\2", dat)
dat = gsub("octore", "octobre", dat)
dat = gsub("sept\\.", "sep.", dat)
dat = parse_date_time(gsub("\\.", "", dat), "%d %m %y", locale = "fr_FR")
dat = as.Date(dat)
# fix one unparsable date
dat[ addr == "http://stripsjournal.canalblog.com/archives/2012/11/14/26045046.html" ] =
as.Date("2012-11-14")
file = gsub("(.*) - (.*) - (.*)", "\\2 - \\1", titl)
file = gsub("Charlie Hebdo Nª", "", file)
file = paste(dat, file)
kwd = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][1]")
kwd = lapply(kwd, xpathSApply, "a[@rel='tag']", xmlValue)
# fix a few keywords
kwd = lapply(kwd, function(x) {
y = tolower(x[ !x %in% c("Charlie Hebdo", "Les Unes de Charlie Hebdo") ])
y[ y == "aubry" ] = "martine aubry"
y[ y == "crise économique 2009..." ] = "crise économique 2009"
y[ y == "tapie" ] = "bernard tapie"
y[ y == "emmigration" ] = "émigration"
y[ y == "univercité" ] = "université"
return(gsub("œ", "oe", y))
})
kwd = sapply(kwd, paste0, collapse = ";")
aut = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][2]")
aut = lapply(aut, xpathSApply, "a[contains(@href, 'archives')]", xmlValue)
aut = sapply(aut, head, 1)
# img = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::p//img/@src")
img = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][1]/following-sibling::p[2]")
img = sapply(img, xpathSApply, "a/img/@src")
img = sapply(img, function(x) ifelse(is.null(x), NA, x))
# fix six parser errors
img[ addr == "http://stripsjournal.canalblog.com/archives/2012/04/30/24141374.html" ] =
"http://p0.storage.canalblog.com/06/27/177230/75268181.jpg"
img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/20/28478698.html" ] =
"http://p2.storage.canalblog.com/22/38/177230/91674307.jpg"
img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/12/28419822.html" ] =
"http://p1.storage.canalblog.com/15/42/177230/91450058_o.jpg"
img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/05/28369503.html" ] =
"http://p2.storage.canalblog.com/28/47/177230/91251269_o.jpg"
img[ addr == "http://stripsjournal.canalblog.com/archives/2013/10/29/28316947.html" ] =
"http://p4.storage.canalblog.com/44/10/177230/91032930_o.jpg"
img[ addr == "http://stripsjournal.canalblog.com/archives/2015/01/12/31306656.html" ] =
"http://p6.storage.canalblog.com/68/20/177230/101510919_o.png"
file = paste0(file, gsub("(.*)\\.(gif|jpg|png)", ".\\2", img))
d = rbind(d, data.frame(post_page = i,
post_url = addr, post_date = date, post_title = titl,
date = dat, tags = kwd, author = aut, image = img,
file, stringsAsFactors = FALSE))
cat(":", sprintf("%3.0f", nrow(d)), "total covers\n")
}
}
# fix three missing authors
d$author[ d$author == "Index Dessinateurs" ] = NA
d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2009/10/28/30971837.html" ] =
"Luz"
d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2014/05/21/30964112.html" ] =
"Cabu"
d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2014/05/14/30964082.html" ] =
"Cabu"
write.csv(d, covers, row.names = FALSE)
}
d = read.csv(covers, stringsAsFactors = FALSE)
for(i in which(!is.na(d$image))) {
file = paste0("covers/", d$file[ i ])
if(!file.exists(file))
try(download.file(d$image[ i ], file, quiet = TRUE), silent = TRUE)
}
# dates
print(table(year(d$date), exclude = NULL))
d$year = year(d$date)
d$quarter = paste0(d$year, "_", quarter(d$date))
# numérotation
d$num = str_extract(d$post_title, "°\\d+")
d$num[ is.na(d$num) ] = str_extract(d$post_title[ is.na(d$num) ], "Charlie Hebdo \\d+")
d$num = gsub("\\D", "", d$num)
stopifnot(n_distinct(d$num) == nrow(d))
# authors
print(table(d$author, exclude = NULL))
# keywords
terms = unlist(strsplit(d$tags, ";"))
cat("\n", length(unique(terms)), "unique keywords:\n")
print(table(terms)[ table(terms) > quantile(table(terms), .99) ])
# themes
stopifnot(!length(terms[ is.na(themes(terms)) ]))
# complete edge list
full = data.frame()
for(i in 1:nrow(d)) {
y = unlist(strsplit(d$tags[ i ], ";"))
full = rbind(full, data.frame(
expand.grid(i = y, j = y, stringsAsFactors = FALSE),
author = d$author[ i ],
quarter = d$quarter[ i ],
weight = 1 / (1 + str_count(d$tags[ i ], ";")) # inverse weighting
))
}
csv = data.frame(tag = unique(terms), theme = themes(unique(terms))) %>%
arrange(tag)
head(csv)
csv$nums = NA
K = strsplit(d$tags, ";")
for(i in csv$tag) {
k = lapply(K, function(x) i %in% x)
k = as.numeric(d$num[ which(sapply(k, isTRUE)) ])
if(length(k))
csv$nums[ csv$tag == i ] = paste0(k [ order(k) ], collapse = ";")
}
write.csv(csv, "charlie-tags.csv", row.names = FALSE)
#
# Thematic plots
#
th = c(
"Armée/Police" = "Varia",
"Capitalisme" = "Varia",
"France" = "Varia",
"International" = "International",
"Medias" = "Varia",
"Politique" = "Politique",
"Religion" = "Religion",
"Showbiz" = "Varia",
"Terrorisme" = "Violence",
"Varia" = "Varia",
"Violence" = "Violence"
)
yy = lapply(d$tags, function(x) {
y = unlist(strsplit(x, ";"))
y = th[ themes(y, simplify = F) ]
unique(y)
})
dd = data.frame()
for(i in unique(d$year)) {
y = unlist(yy[ d$year == i ])
print(i)
print(table(y))
dd = rbind(dd, data.frame(year = i, theme = names(table(y)),
freq = as.vector(table(y)),
prop = as.vector(table(y) / sum(table(y)))))
}
colors = c(
"Politique" = "#80b1d3", # Set3 / light blue
"International" = "#fdb462", # Set3 / light orange
"Religion" = "#4daf4a", # Set1 / bright green
"Terrorisme" = "#e41a1c", # Set1 / bright red
"Varia" = "#999999", # Set1 / grey
"Violence" = "#984ea3" # Set1 / bright purple
)
qplot(data = dd, fill = theme, y = prop, x = factor(year), stat = "identity", geom = "bar") +
scale_fill_manual("Thème", values = colors2) +
theme_bw() +
theme(panel.grid = element_blank()) +
labs(y = "Fréquence relative\n", x = "\nAnnée")
ggsave("plots/charlie_themes_prop.pdf", width = 9, height = 8)
qplot(data = dd, fill = theme, y = freq, x = factor(year), stat = "identity", geom = "bar") +
scale_fill_manual("Thème", values = colors2) +
theme_bw() +
theme(panel.grid = element_blank()) +
labs(y = "Fréquence brute\n", x = "\nAnnée")
ggsave("plots/charlie_themes_freq.pdf", width = 9, height = 8)
# done
#
# quick and dirty ERGM specification for the co-occurrence network
# NB: to be run after the exploratory models in charlie-models.r
#
# edge list
e = subset(full[, 1:2 ], i != j)
e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
e = data.frame(table(e$u))
e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$Var1),
j = gsub("(.*)///(.*)", "\\2", e$Var1),
n = e$Freq / 2)
# network
n = network(e[, 1:2 ], directed = FALSE)
t = themes(network.vertex.names(n), simplify = FALSE)
n %n% "covers" = 338
n %v% "theme" = t
set.edge.attribute(n, "weight", ifelse(e$n > 10, 10, e$n))
print(table(n %e% "weight"))
# > subset(e, n > 10)
# i j n
# 779 élections présidentielle 23
# 781 élections présidentielle 2012 22
# 789 élections sarkozy 17
# 871 extrême droite fn 17
# 880 extrême droite marine le pen 14
# 961 fn marine le pen 15
# 1189 intégrisme religion 17
# 1563 présidentielle 2012 sarkozy 15
# 1571 présidentielle présidentielle 2012 22
# 1575 présidentielle sarkozy 15
th = c(
"Armée/Police" = "Varia",
"Capitalisme" = "Varia",
"France" = "Varia",
"International" = "International",
"Medias" = "Varia",
"Politique" = "Politique",
"Religion" = "Religion",
"Showbiz" = "Varia",
"Terrorisme" = "Violence",
"Varia" = "Varia",
"Violence" = "Violence"
)
# collapse themes
n %v% "theme" = th[ n %v% "theme" ]
print(table(n %v% "theme", exclude = NULL))
E = ergm(n ~ nodemix("theme"))
b = which(names(coef(E)) %in% c("mix.theme.France.France",
"mix.theme.International.International",
"mix.theme.Politique.Politique",
"mix.theme.Religion.Religion",
"mix.theme.Showbiz.Showbiz",
"mix.theme.Terrorisme.Terrorisme",
"mix.theme.Varia.Varia",
"mix.theme.Violence.Violence"))
b = c(b, which(grepl("Varia", names(coef(E)))))
#
# Baseline model
#
B = ergm(n ~ edges +
nodefactor("theme", base = 4) +
nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
nodemix("theme", base = b),
parallel = 4,
control = control.ergm(seed = 4575,
MCMLE.termination = "precision",
MCMLE.effectiveSize = 50, # enable adaptive burnin and interval
MCMLE.MCMC.precision = .01, # control how much noise is tolerable
MCMLE.maxit = 50))
print(summary(B))
#
# GWDSP-adjusted model
#
D = ergm(n ~ edges +
nodefactor("theme", base = 4) +
nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
nodemix("theme", base = b) +
gwdsp(1, fixed = TRUE),
parallel = 4,
control = control.ergm(seed = 4575,
MCMLE.termination = "precision",
MCMLE.effectiveSize = 50, # enable adaptive burnin and interval
MCMLE.MCMC.precision = .01, # control how much noise is tolerable
MCMLE.maxit = 50))
print(summary(D))
#
# GWD-adjusted models, at various values of alpha decay
#
a = seq(0, 5, by = 0.1)
alphas = c()
L = list()
for(i in a) {
cat("\nAttempting to fit at decay alpha =", i, "\n")
E = try(ergm(n ~ edges +
nodefactor("theme", base = 4) +
nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
nodemix("theme", base = b) +
gwdegree(i, fixed = TRUE), # gwesp(1, fixed = TRUE), gwdsp(1, fixed = TRUE),
parallel = 4,
control = control.ergm(seed = 4575,
MCMLE.termination = "precision",
MCMLE.effectiveSize = 50, # enable adaptive burnin and interval
MCMLE.MCMC.precision = .01, # control how much noise is tolerable
MCMLE.maxit = 50)), silent = TRUE)
if(!"try-error" %in% class(E)) {
print(summary(E))
L[[ which(a == i) ]] = E
alphas = c(alphas, i)
}
}
L = L[ !sapply(L, is.null) ]
r = data.frame()
for(i in 1:length(L)) {
r = rbind(r, data.frame(
x = names(coef(L[[ i ]])),
b = summary(L[[ i ]])$coefs[, 1],
se = summary(L[[ i ]])$coefs[, 2],
alpha = alphas[ i ],
stringsAsFactors = FALSE))
}
r = bind_rows(r)
r$ub = r$b + 2 * r$se
r$lb = r$b - 2 * r$se
# see how the GWD term moves: a < 1 and a > 3 are alright
qplot(data = subset(r, grepl("gwd", x)),
x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
geom_hline(y = 0, lty = "dashed") +
facet_wrap(~ alpha, scales = "free", nrow = 1) +
labs(y = "Coefficient estimate\n", x = "\nGWD alpha decay parameter") +
theme_bw() +
theme(panel.grid = element_blank())
r = subset(r, alpha < 1 | alpha > 3)
qplot(data = subset(r, grepl("nodefactor", x)),
x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
geom_hline(y = 0, lty = "dashed") +
facet_wrap(~ x) +
theme_bw()
qplot(data = subset(r, grepl("nodematch", x)),
x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
geom_hline(y = 0, lty = "dashed") +
facet_wrap(~ x) +
theme_bw()
qplot(data = subset(r, grepl("mix", x)),
x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
geom_hline(y = 0, lty = "dashed") +
facet_wrap(~ x) +
labs(y = "Coefficient estimate\n", x = "\nGWD alpha decay parameter") +
theme_bw() +
theme(panel.grid = element_blank())
save(B, D, L, alphas, n, file = "charlie-ergm.rda")
# have a nice day
#
# quick and dirty ERGM specifications for the co-occurrence networks
#
library(ergm)
library(ergm.count)
coefs = data.frame()
for(i in ls(pattern = "^net_")) {
nn = get(i)
# collapse general themes
nn %v% "theme" = ifelse(nn %v% "theme" %in% c("France", "International"), "Varia", nn %v% "theme")
cat("\n", i, ":\n\n")
print(table(nn %v% "theme", exclude = NULL))
E = ergm(nn ~ nodemix("theme"))
b = which(names(coef(E)) %in% c("mix.theme.France.France", # useless
"mix.theme.International.International",
"mix.theme.Religion.Religion",
"mix.theme.Terrorisme.Terrorisme",
"mix.theme.Varia.Varia",
"mix.theme.Violence.Violence"))
b = c(b, which(grepl("Varia", names(coef(E)))))
if(!exists(paste0("ergm_", i))) {
# binary model for all networks
E = ergm(nn ~ edges + nodefactor("theme") + nodemix("theme", base = b),
control = control.ergm(seed = 4575,
MCMLE.termination = "precision",
MCMLE.effectiveSize = 50, # enable adaptive burnin and interval
MCMLE.MCMC.precision = .01, # control how much noise is tolerable
MCMLE.maxit = 50))
# add weighted model for author-specific networks
if(!grepl("\\d", i)) {
W = try(ergm(nn ~ nonzero + sum + nodefactor("theme") + nodemix("theme", base = b),
response = "weight", reference = ~ Poisson,
control = control.ergm(seed = 4575,
MCMC.prop.weights = "0inflated",
MCMLE.trustregion = 1000,
MCMLE.termination = "precision",
MCMLE.effectiveSize = 50, # enable adaptive burnin and interval
MCMLE.MCMC.precision = .01, # control how much noise is tolerable
MCMLE.maxit = 50)))
assign(paste0("wergm_", i), W)
}
} else {
E = get(paste0("ergm_", i))
}
assign(paste0("ergm_", i), E)
coefs = rbind(coefs, data.frame(
x0 = "B",
network = gsub("net_", "", i),
x = names(coef(E)),
b = coef(E),
se = summary(E)$coefs[ , 2 ],
stringsAsFactors = FALSE
))
if(exists(paste0("wergm_", i))) {
W = get(paste0("wergm_", i))
if(!"try-error" %in% class(W))
coefs = rbind(coefs, data.frame(
x0 = "W",
network = gsub("net_", "", i),
x = names(coef(W)),
b = coef(W),
se = summary(W)$coefs[ , 2 ],
stringsAsFactors = FALSE
))
}
}
#
# temporal ERGM results
#
qplot(data = subset(coefs, grepl("20(09|10|11|12|13|14)", network) &
x0 == "B" & grepl("mix", x) & !is.infinite(b) & se < 10),
y = b, x = x0, color = x) +
geom_segment(aes(xend = x0, y = b - 2 * se, yend = b + 2 *se)) +
geom_hline(y = 0, lty = "dashed") +
facet_grid(x ~ network) +
scale_color_brewer("Coefficients", palette = "Set1") +
labs(x = NULL, y = "log Pr( co-occurrence )\n") +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
panel.grid = element_blank(),
legend.position = "bottom")
ggsave("ergm_time.pdf", width = 9, height = 8)
#
# author-specific ERGM results, binary and valued
#
coefs$x = gsub("sum\\.", "", coefs$x)
qplot(data = subset(coefs, grepl("Cabu|Charb|Riss|Luz", network) &
grepl("mix", x) & !is.infinite(b) & se < 10),
y = b, x = x0, color = x) +
geom_segment(aes(xend = x0, y = b - 2 * se, yend = b + 2 *se)) +
geom_hline(y = 0, lty = "dashed") +
facet_grid(x ~ network) +
scale_color_brewer("Coefficients", palette = "Set1") +
labs(x = NULL, y = "log Pr( co-occurrence )\n") +
theme_bw() +
theme(#axis.text.x = element_blank(),
#axis.ticks.x = element_blank(),
panel.grid = element_blank(),
legend.position = "bottom")
ggsave("ergm_authors.pdf", width = 9, height = 8)
# kthxbye
#
# build temporal and author-specific co-occurrence networks
#
library(animation)
library(dplyr)
library(GGally)
library(ggplot2)
library(network)
source("charlie-data.r")
dir.create("plots", showWarnings = FALSE)
colors = c(
"France" = "#80b1d3", # Set3 / light blue
"International" = "#fdb462", # Set3 / light orange
"Religion" = "#4daf4a", # Set1 / bright green
"Terrorisme" = "#e41a1c", # Set1 / bright red
"Varia" = "#999999", # Set1 / grey
"Violence" = "#984ea3" # Set1 / bright purple
)
#
# time-specific term networks
#
prop = data.frame()
for(k in names(table(d$quarter)[ table(d$quarter) > 10 | names(table(d$quarter)) == "2015_1" ])) {
# remove self-loops and select author
e = subset(full, i != j & quarter == k)[, 1:2 ]
cat("Building:", k, nrow(e), "edges\n")
# aggregate similar rows
e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
e = data.frame(table(e$u))
e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$Var1),
j = gsub("(.*)///(.*)", "\\2", e$Var1),
n = e$Freq / 2)
n = network(e[, 1:2 ], directed = FALSE)
t = themes(network.vertex.names(n))
n %n% "period" = k
n %v% "theme" = t
set.edge.attribute(n, "weight", e$n)
## print(table(n %e% "weight"))
# compare network dimensions
nv0 = network.size(n)
ne0 = network.edgecount(n)
nd = network.density(n)
g = ggnet(n, label.nodes = network.vertex.names(n),
node.group = t, node.color = colors[ names(colors) %in% t ],
size = 0, label.size = 4, segment.color = "grey50",
segment.alpha = .5) +
theme(legend.key = element_blank(), legend.position = "bottom") +
ggtitle(paste("Période", gsub("_", "-", k),
"n =", table(d$quarter)[ names(table(d$quarter)) == k ],
"'unes'", nv0, "mots-clés", ne0, "co-occurrences\n"))
ggsave(paste0("plots/network_", k, ".pdf"), g, width = 12, height = 12)
assign(paste0("plot_", k), g)
assign(paste0("net_", k), n)
assign(paste0("edges_", k), e)
# subset to selected themes
t = t %in% c("Religion", "Terrorisme", "Violence")
delete.vertices(n, which(!t))
prop = rbind(prop, data.frame(t = k,
n = table(d$quarter)[ names(table(d$quarter)) == k ],
d = nd,
nv0, nv1 = network.size(n),
ne0, ne1 = network.edgecount(n),
stringsAsFactors = FALSE))
print(tail(prop, 1))
}
stop()
# dimensions
summary(prop$d)
prop$pv = with(prop, nv1 / nv0)
prop$pe = with(prop, ne1 / ne0)
# plot
prop$year = substr(prop$t, 1, 4)
prop$quarter = substr(prop$t, 6, 7)
prop = arrange(prop, -pe) %>%
mutate(color = 1:n()) %>%
mutate(color = cut(color, breaks = c(1, 3, 7, Inf), include.lowest = TRUE))
qplot(data = subset(prop, year %in% 2009:2014),
x = quarter, y = pe, group = year, fill = color,
stat = "identity", geom = "bar") +
geom_hline(y = prop$pe[ prop$t == "2015_1" ], lty = "dashed") +
facet_wrap(~ year, nrow = 1) +
scale_fill_manual("", values = c("[1,3]" = "#2c7fb8", "(3,7]" = "#7fcdbb", "(7,Inf]" = "#edf8b1")) +
guides(fill = FALSE) +
labs(y = "Proportion des liens 'Religion, Terrorisme, Violence'\n",
x = "\nPériode annuelle : 1. janvier-mars 2. avril-juin 3. juillet-septembre 4. octobre-décembre") +
theme_bw() +
theme(panel.grid.major.x = element_blank())
ggsave("plots/prop_edges.pdf", width = 9, height = 7)
prop = arrange(prop, -pv) %>%
mutate(color = 1:n()) %>%
mutate(color = cut(color, breaks = c(1, 3, 7, Inf), include.lowest = TRUE))
qplot(data = subset(prop, year %in% 2009:2014),
x = quarter, y = pv, group = year, fill = color,
stat = "identity", geom = "bar") +
geom_hline(y = prop$pv[ prop$t == "2015_1" ], lty = "dashed") +
facet_wrap(~ year, nrow = 1) +
scale_fill_manual("", values = c("[1,3]" = "#2c7fb8", "(3,7]" = "#7fcdbb", "(7,Inf]" = "#edf8b1")) +
guides(fill = FALSE) +
labs(y = "Proportion des mots-clés 'Religion, Terrorisme, Violence'\n",
x = "\nPériode annuelle : 1. janvier-mars 2. avril-juin 3. juillet-septembre 4. octobre-décembre") +
theme_bw() +
theme(panel.grid.major.x = element_blank())
ggsave("plots/prop_nodes.pdf", width = 9, height = 7)
# movie
saveGIF({
for(i in sort(ls(pattern = "plot_\\d+")))
print(get(i))
}, movie.name = "network_quarters.gif", ani.width = 600, ani.height = 600)
#
# author-specific term networks
#
for(k in names(table(d$author)[ table(d$author) > 50 ])) {
# remove self-loops and select author
e = subset(full, i != j & author == k)[, 1:2 ]
cat("Building:", k, nrow(e), "edges\n")
# aggregate similar rows
e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
e = data.frame(table(e$u))
e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$Var1),
j = gsub("(.*)///(.*)", "\\2", e$Var1),
n = e$Freq / 2)
n = network(e[, 1:2 ], directed = FALSE)
t = themes(network.vertex.names(n))
n %n% "period" = k
n %v% "theme" = t
set.edge.attribute(n, "weight", e$n)
## print(table(n %e% "weight"))
assign(paste0("plot_", k), g)
assign(paste0("net_", k), n)
assign(paste0("edges_", k), e)
ggnet(n, label.nodes = network.vertex.names(n),
node.group = t, node.color = colors[ names(colors) %in% t ],
size = 0, label.size = 4, segment.color = "grey50",
segment.alpha = .5) +
theme(legend.key = element_blank(), legend.position = "bottom") +
ggtitle(paste(k, ", n =", table(d$author)[ names(table(d$author)) == k ],
"'unes'", network.size(n), "mots-clés",
network.edgecount(n), "co-occurrences\n"))
ggsave(paste0("plots/network_", k, ".pdf"), width = 12, height = 12)
}
# done
tag theme nums
11 septembre Terrorisme 848
14 juillet Varia 996
2022 Varia 1177
absentéisme Varia 991
afghanistan International 850
agriculture Varia 974;1116
ahmadinejad International 879
air france Varia 885
airbus Varia 885
al-qaida Terrorisme 985
alcoolisme Varia 935
algérie International 971
alimentation Varia 1079
allègre France 929
allemagne International 1039;1065;1110
amende Varia 884
anne sinclair France 990;994
anniversaire Varia 881;933
antisémitisme Religion 879;977;1013;1031;1124;1125;1153
armée Varia 850;992;996;1075;1099;1106
armes Varia 1133
assassinat Violence 1031;1032;1062
astérix Varia 1061
athées Religion 847;983
attentats Violence 1118
aviation Varia 885;890
avion Varia 885;890;1154
bac Varia 887;991;1096
bachar al-assad International 1026;1106
ballerine Varia 1005
banditisme Varia 1107;1109
banques Varia 849;852;1013;1044;1102;1179
bateau Varia 1022
batman Varia 1049
bayrou France 886;898
belgique International 1056;1069
ben ali International 976
ben laden Terrorisme 959;985
benoit xvi Religion 847;877;928;962;1041;1070;1078;1080;1081
berlusconi International 1081
bernadette chirac France 1128
bernard arnault Varia 1056
bernard tapie France 1096;1102
besson France 880;900;906;908;914;921;952
bettencourt France 942;944;945
bible Religion 983
bigard France 848
bizutage Varia 1039
bnp Varia 1013
bonne année Varia 915
bonus Varia 895
borloo France 801
bourse Varia 849;851;895;999
boutin France 1092
brésil International 862;873;1101
bretagne France 1116;1117
budget Varia 1140
burka Religion 888;920
bush International 855
ça déraille Varia 1100
capitalisme Varia 876
carbone Varia 899
caricature Varia 1057
carla bruni France 914;927;931;946;1003;1006;1027;1030;1161
catastrophes naturelles Varia 918;978
cavanna Varia 1129
cécile duflot France 1132
charia Religion 1010;1011
charter Varia 950
chirac France 976;1000;1004
chômage Varia 850;868;937;1048;1079
christiane taubira France 1132
cinéma Varia 831;928;999;1091;1119;1151
clearstream France 901
climat Varia 924;925;929
clonage Varia 912
closer France 1174
clowns Varia 1168
cocaïne Varia 882
cohn-bendit France 886
colonialisme Varia 1153
coluche Varia 852;1171
concept cars Varia 957
consommation Varia 1018
copé France 1040;1053;1063;1066;1067;1070;1094;1105
coran Religion 983;1099
corruption Varia 973
corse France 1062
crash test Varia 885;890;1154
crédit lyonnais Varia 1102
crise Varia 849;851;852;853;860;868;872;877;891;896;937;999;1047;1052;1090;1100;1140;1152;1157
crise économique 2009 Varia 868;872;877;891;896;937
crise financière 2008 Varia 849;851;852;853;860
croissance Varia 870;896;1104;1125;1152;1157
crs Varia 949;958
darcos France 910
dati France 854
de gaulle France 939;1093
delarue France 1054
délation Varia 880
démocratie Varia 1094
derrick Varia 861
dieu Religion 1064;1108
dieudonné France 1124;1125
dimanche Varia 1111
discrimination Violence 966
disparus Varia 885
drh Varia 903
drogue Varia 1014;1054
droite France 887;926;1093
dsk France 968;987;989;990;993;994;998;1005;1010;1015;1051;1068;1081;1086;1134;1143;1160
durban International 879
ebola Varia 1156
école Varia 1073;1108;1130;1142
écologie Varia 801;899;982;1116
économie Varia 870;872;895;923;934;1104;1110;1125;1140;1141;1152;1157
écoute Varia 1003
education nationale Varia 865;883;1142
egypte International 972;1045;1099
élections Varia 729;831;846;855;856;885;886;925;932;970;978;980;982;994;1002;1007;1009;1020;1023;1024;1025;1027;1028;1029;1031;1032;1033;1034;1035;1036;1037;1038;1042;1043;1063;1066;1067;1094;1110;1135;1136;1177
elvis Varia 985
elysée France 1176
émigration Varia 1112
enseignement Varia 865;883
entreprises Varia 910;1141
environnement Varia 801;899;929;1116;1117
épidémie Varia 892;909;1156
essence Varia 957;1001
été Varia 890;891;1048;1103;1104;1156;1157
europe Varia 885;934;953;1013;1017;1039;1065;1112;1144
européennes France 885;886
eva joly France 996
évêque Religion 873
examen Varia 883
expulsions Varia 1114
extrême droite France 921;926;932;965;978;984;997;1028;1029;1031;1036;1043;1066;1070;1095;1109;1112;1135;1147;1172;1174
fabius France 1087;1089
facebook Varia 935
famille Varia 1164
farine animale Varia 1079
femen Varia 1053;1081
figaro Varia 1113
fillon France 868;926;961;964;996;1014;1040;1053;1063;1066;1067;1070;1105;1169
florange Varia 1059;1068
florian philippot France 1174
fmi Varia 987;989
fn France 921;926;932;978;979;984;997;1031;1036;1043;1066;1095;1109;1112;1135;1136;1144;1145;1147;1150;1165;1172;1174
foot Varia 937;938;939;940;986;1115;1148;1149
france Varia 918;1039;1065;1106;1162
france télécom Varia 902
france télévision Varia 864
françois 1er Religion 1082;1083;1101;1179
françois hollande France 846;1008;1009;1023;1033;1038;1039;1044;1045;1047;1048;1052;1055;1061;1072;1075;1086;1089;1090;1091;1098;1103;1107;1110;1114;1119;1121;1122;1123;1125;1126;1132;1134;1139;1146;1152;1157;1160;1171;1176
françois mitterrand France 969;1000
frédéric mitterrand France 904
frigide barjot France 1088;1092
galliano France 977
gauche France 887;1047;1138
gaza International 863;865
georges lautner France 1119
gérard depardieu France 1069;1070;1133
gigolo Varia 942
glou glou Varia 1103;1112
grèce International 1052
grenelle France 801
grève Varia 875;957;1151
grippe Varia 892;909
guadeloupe France 871
guéant France 1014;1025;1029;1040;1053
guerre Violence 850;894;939;979;985;1001;1075;1106;1107;1155
haïti International 918
hara-kiri Varia 1129
hitler Varia 1013
homophobie Violence 1012;1046;1064;1073;1074;1088;1092;1093
hortefeux France 900;948;1040;1053;1105
houellebecq France 1177
hulot France 982
humanisme Varia 991
immigration Varia 880;906;946;1112
impôts Varia 853;1024;1056;1069;1086;1106;1113;1115;1116;1117
incendie criminel Violence 1012
intégrisme Religion 866;888;917;920;959;1011;1012;1016;1049;1057;1058;1070;1073;1075;1099;1130;1142;1143;1162;1163;1166;1170
intempéries Varia 1097
internet Varia 935
irak International 1170
iron man Varia 1091
israël International 863;865;1153
israël-palestine International 863;865;866;1153
japon International 978
jean-luc mélenchon France 1033;1042
jean-marc ayrault France 1055;1072;1087;1089;1122;1132
jean-pierre pernaut France 919;960
jean-yves le drian France 1132
jeanne d'arc Varia 1145
jérôme cahuzac France 1086;1087
jesus Religion 905;966;983;1016;1064;1082;1084;1173
jihad Terrorisme 1170
jo Varia 1049;1050;1131
johnny hallyday France 913;936
journal Varia 1113;1118
journalisme Varia 1050
journée de... Varia 924;1004
juppé France 1014
justice Varia 901;945;974;976;987;990;1071;1102
kadhafi International 975;976;1001
kouchner France 961
krach Varia 999
l'oréal Varia 944;945
la poste Varia 903
lagarde France 849;989
laïcité Religion 981;1108
lcl Varia 1102
le cimetière Varia 853;861;889;930;985;1000;1054;1129
le monde Varia 1003
le pen France 926;965;1028;1029;1031;1109;1147;1150;1165;1172
législatives France 1042;1043;1110
les dictatures International 975
les firmes Varia 902;944;945;1021;1022;1085;1111
les riches Varia 1056
les verts France 996
libération Varia 1118
libye International 975;979;1001;1010;1011
livres Varia 1030;1177
logement Varia 859
lois Varia 1046
lourdes Religion 1097
luc chatel France 1040
luc ferry France 991
madonna France 1042
mahomet Religion 1163;1178
malaise Varia 893
mali International 1074;1075
mam France 971;973;975
manifestations Varia 867;874;875;949;951;956;957;975;992;1073;1074;1099;1113;1164
manuel valls France 1089;1132;1137;1138;1139;1140;1158
mariage Varia 1046;1074;1076;1088;1092;1093
marine le pen France 965;970;977;978;979;984;997;1028;1029;1031;1042;1043;1066;1070;1095;1112;1144;1145;1147;1150;1165;1167;1171;1172;1179
maroc International 971
marseille France 1107
martine aubry France 858;898;968;988;995;1008
massacre Violence 1178
mc donald's Varia 1168
medef Varia 875
médicament Varia 743;916
mer Varia 890;891;895;944
merkel International 1039;1065;1110
michael jackson France 889
michel sapin France 1132
ministre France 1137
mireille mathieu France 1059
miss france France 964;1120
mitt romney International 1063
mode Varia 977
mohamed merah Terrorisme 1060
montebourg France 1089;1132;1141;1158
moubarak International 976
municipales France 1094;1135;1136
nadine morano France 1014;1021
najat belkacem France 1159
nathalie kosciusko-morizet France 1094
nazisme Varia 861;1013
négationnisme Varia 869;879
népotisme Varia 904
nice France 1109
nigeria International 1143
noël Varia 858;860;914;966;1018;1173
norvège International 997
nucléaire Varia 729
nudisme Varia 1132
obama International 846;855;866;1107
opus dei Religion 905
otage Terrorisme 987
ouverture Varia 887;897;927
palestine International 863;865;1153
pape Religion 847;869;877;928;962;1041;1078;1080;1081;1082;1083;1101;1179
paris France 1094
parisot Varia 875
parricide Violence 1076
patrick buisson France 1134
patrimoine Varia 1087
pédophilie Violence 873;894;928;1073
pétrole Varia 1001
phallocrate Varia 1127
pharmaceutique Varia 916
philosophie Varia 887;991
pierre moscovici France 1089
pip Varia 1021;1022
pirates Varia 878
pitbulls Varia 961
plage Varia 890;891;944;997
pma Varia 1077
polanski France 928
police Varia 884;949;958
pollution Varia 1116
poséidon adventure Varia 1022
poutine International 1053;1131;1133
préservatif Varia 877;962;994
président France 1126;1149
présidentielle France 932;970;980;982;994;1002;1009;1020;1023;1024;1025;1027;1028;1029;1031;1032;1033;1034;1035;1036;1037;1038;1063
présidentielle 2007 France 729
présidentielle 2012 France 932;970;980;982;994;1002;1009;1020;1023;1024;1025;1027;1028;1029;1031;1032;1033;1034;1035;1036;1037;1038
présidentielle 2017 France 1161
presse Varia 1003;1118;1174
primaires France 1007;1094
prisons Varia 854;1053;1175
privatisation Varia 903
prof Varia 883
prophète Religion 905;966;983;1016;1064;1082;1084;1163;1178
prostitution Varia 1120
provence-alpes-côte d'azur France 1107;1109
ps France 831;857;888;968;988;1007;1009;1055;1086;1087;1132;1158
pub Varia 861;864;982
pussy riot International 1053
qatar International 1059
radio Varia 941
radio france Varia 941
rama yade France 908;961
ramadan Religion 1177
raymond domenech France 940
recyclage Varia 1091
réforme Varia 876
régime Varia 963
régionales France 925
régions françaises France 1116;1117
religion Religion 847;873;888;917;920;928;962;981;983;1011;1012;1016;1049;1057;1058;1064;1073;1078;1080;1082;1083;1097;1099;1101;1108;1111;1130;1142;1143;1162;1163;1166;1173;1177;1178
remaniement Varia 954;1132;1158
rentrée Varia 1002
répression Varia 975;1026
retraite Varia 922;930;935;949;950;951;955;1078
révolution Varia 972;975;1026
rigueur Varia 1090
rio International 1101
rolex France 877
roselyne bachelot France 909
roumanie International 950
rumeur Varia 995;998
russie International 1131;1133
santé Varia 892;893;909;911;935;1021;1079;1117;1121
sarkozy France 743;858;859;862;863;867;868;870;871;872;874;878;881;882;884;893;899;901;904;907;912;914;915;919;921;922;923;925;927;931;933;941;943;944;946;950;953;954;957;964;967;970;974;979;980;1000;1002;1003;1004;1006;1007;1009;1019;1020;1023;1024;1025;1026;1027;1030;1032;1034;1035;1036;1037;1038;1051;1084;1096;1105;1121;1134;1137;1151;1161;1162;1167;1169;1171;1179
science Varia 917;929
sdf Varia 859;860;874
sécurité Varia 946
sécurité sociale Varia 936
ségolène royal France 729;831;856;857;858
sénat France 1006
sénateur France 1006
sexisme Violence 924;988;1010;1081;1127;1159
sexualité Varia 743;1155
siné Varia 1071
smic Varia 1148
social Varia 1077
soeur emmanuelle Varia 853
sondages Varia 993;1023;1176
sotchi Varia 1131
sport Varia 882;934;937;938;939;940;1049;1050;1098;1115;1131;1148;1149
st valentin Varia 869
stock-options Varia 875
suicide Varia 854;902;903;1086
suisse International 1087
surpopulation Varia 1077
syrie International 1106;1107;1170
taser Varia 859
taxe Varia 899;1116;1117
télé Varia 864;1083;1085;1151;1169;1175
téléphone Varia 898
téléthon Varia 911
tempête Varia 924;925
terrorisme Terrorisme 955;959;985;1032;1166;1178;1179
tf1 Varia 919;960;1085
théâtre Varia 1016;1151
torah Religion 983
torture Violence 1011
totalitarisme Varia 975
toulouse France 1031;1032
trader Varia 895
train Varia 1100
travail Varia 1111
tueur de masse Violence 1076
tunisie International 971;972;973;1011
tva Varia 1024
twitter Varia 1045
ukraine International 1133
ump France 897;925;926;948;958;961;979;1004;1014;1015;1036;1040;1053;1066;1067;1094;1105
université Varia 883
usa International 846;851;855;918;987;1063
vacances Varia 862;890;891;895;972;993;1103;1104
vache folle Varia 1079
valérie trierweiler France 1044;1045;1047;1098;1125;1127;1128;1160
vatican Religion 877;1041;1082
vie privée Varia 1126
villepin France 901
violence Violence 865;883
voeux Varia 967;1019
voile Religion 888;1049;1122
voiture Varia 957
wall street Varia 851
woerth France 930;942;943;944;951
xavier darcos France 865
xénophobie Violence 900;947;948;949;953;986;996;1026;1114;1124;1125;1145;1165;1167
yannick noah France 1014
zemmour France 970;1165;1167
themes <- function(x) {
y = rep(NA, length(x))
# france
y[ x %in% c("bretagne", "corse", "guadeloupe", "marseille", "nice", "paris",
"provence-alpes-côte d'azur", "régions françaises",
"toulouse") ] = "France"
# politique française
y[ x %in% c("allègre", "bayrou", "bernard tapie",
"bernadette chirac", "besson", "bettencourt", "borloo","boutin",
"cécile duflot", "chirac", "christiane taubira", "clearstream",
"cohn-bendit", "copé", "darcos", "dati", "de gaulle", "droite",
"dsk", "elysée", "européennes", "eva joly", "extrême droite",
"fabius", "fillon", "florian philippot", "fn", "françois hollande",
"françois mitterrand", "frédéric mitterrand", "gauche", "grenelle",
"guéant", "hortefeux", "hulot", "jean-luc mélenchon", "jean-marc ayrault",
"jean-yves le drian", "jérôme cahuzac", "juppé", "kouchner",
"lagarde", "le pen", "législatives", "les verts", "luc chatel",
"luc ferry", "mam", "manuel valls", "marine le pen", "martine aubry",
"michel sapin", "ministre", "montebourg", "municipales",
"nadine morano", "najat belkacem", "nathalie kosciusko-morizet",
"patrick buisson", "pierre moscovici", "président", "présidentielle",
"présidentielle 2007", "présidentielle 2012", "présidentielle 2017",
"primaires", "ps", "rama yade", "régionales", "rolex",
"roselyne bachelot", "sarkozy", "ségolène royal", "sénat", "sénateur",
"ump", "valérie trierweiler", "villepin", "woerth",
"xavier darcos") ] = "Politique"
# économie : banques, capitalisme, industrie
y[ x %in% c("air france", "airbus", "banques", "bernard arnault", "bnp", "bourse",
"capitalisme", "crédit lyonnais", "crise", "crise économique 2009",
"crise financière 2008", "entreprises", "florange", "fmi", "krach", "lcl",
"les firmes", "les riches", "medef", "parisot", "stock-options", "trader",
"wall street") ] = "Capitalisme"
# showbiz et assimilés
y[ x %in% c("anne sinclair", "bigard", "carla bruni", "closer", "delarue",
"dieudonné", "frigide barjot", "galliano", "gérard depardieu",
"georges lautner", "houellebecq",
"jean-pierre pernaut", "johnny hallyday", "madonna", "michael jackson",
"mireille mathieu", "miss france", "polanski", "raymond domenech", "yannick noah",
"zemmour") ] = "Showbiz"
# politique internationale
y[ x %in% c("afghanistan", "ahmadinejad", "algérie", "allemagne", "bachar al-assad",
"belgique", "ben ali", "berlusconi", "brésil", "bush", "durban", "egypte",
"gaza", "grèce", "haïti", "irak", "israël", "israël-palestine", "japon",
"kadhafi", "les dictatures", "libye", "mali", "maroc", "merkel", "mitt romney",
"moubarak", "nigeria", "norvège", "obama", "palestine", "poutine", "pussy riot", "qatar",
"rio", "roumanie", "russie", "suisse", "syrie", "tunisie", "ukraine", "usa" ) ] = "International"
# terrorisme
y[ x %in% c("al-qaida", "11 septembre", "ben laden", "jihad", "mohamed merah", "otage",
"terrorisme") ] = "Terrorisme"
# religion
y[ x %in% c("antisémitisme", "athées", "benoit xvi", "bible", "burka", "charia", "coran",
"dieu", "évêque", "françois 1er", "intégrisme", "jesus", "laïcité", "lourdes",
"mahomet", "opus dei", "pape", "prophète", "ramadan", "religion", "torah",
"vatican", "voile") ] = "Religion"
# armée et police
y[ x %in% c("14 juillet", "armée", "armes", "crs", "police") ] = "Armée/Police"
# violence
y[ x %in% c("assassinat", "attentats", "discrimination", "guerre", "homophobie",
"incendie criminel", "massacre", "parricide", "pédophilie", "sexisme",
"torture", "tueur de masse", "violence", "xénophobie") ] = "Violence"
# médias
y[ x %in% c("cavanna", "le monde", "libération", "figaro", "france télévision", "hara-kiri",
"radio", "radio france", "siné", "télé", "tf1") ] = "Medias"
# residuals
# 'bac' is the diploma, not the police unit
# 'bizutage' is not necessarily violent
y[ x %in% c("2022", "absentéisme", "agriculture", "alcoolisme",
"alimentation", "amende", "anniversaire", "astérix", "aviation", "avion", "bac",
"ballerine", "banditisme", "bateau", "batman", "bizutage", "bonne année", "bonus",
"budget", "ça déraille", "carbone", "caricature", "catastrophes naturelles",
"charter", "chômage", "cinéma", "climat", "clonage", "clowns",
"cocaïne", "colonialisme", "coluche", "concept cars", "consommation",
"corruption", "crash test", "croissance",
"délation", "démocratie", "derrick", "dimanche", "disparus", "drh", "drogue",
"ebola", "école", "écologie", "économie",
"écoute", "education nationale", "élections", "elvis",
"émigration", "enseignement", "environnement",
"épidémie", "essence", "été", "europe", "examen", "expulsions",
"facebook", "famille", "farine animale", "femen",
"foot", "france", "france télécom",
"gigolo", "glou glou", "grève", "grippe",
"hitler", "humanisme", "immigration", "impôts", "intempéries", "internet",
"iron man", "jeanne d'arc", "jo", "journal", "journalisme", "journée de...",
"justice", "l'oréal",
"la poste", "le cimetière", "livres", "logement", "lois",
"malaise", "manifestations", "mariage",
"mc donald's", "médicament", "mer", "mode",
"nazisme", "négationnisme", "népotisme", "noël",
"nucléaire", "nudisme", "ouverture",
"patrimoine", "pétrole", "phallocrate", "pharmaceutique",
"philosophie", "pip", "pirates", "pitbulls",
"plage", "pma", "pollution", "poséidon adventure",
"préservatif", "presse",
"prisons", "privatisation", "prof", "prostitution", "pub",
"recyclage", "réforme", "régime",
"remaniement", "rentrée", "répression", "retraite",
"révolution", "rigueur", "rumeur", "santé", "science",
"sdf", "sécurité", "sécurité sociale",
"sexualité", "smic", "social",
"soeur emmanuelle", "sondages", "sotchi", "sport", "st valentin",
"suicide", "surpopulation", "taser", "taxe", "téléphone", "téléthon",
"tempête", "théâtre", "totalitarisme", "train", "travail", "tva", "twitter",
"université", "vacances", "vache folle",
"vie privée", "voeux", "voiture") ] = "Varia"
# simplify
y[ y %in% c("Showbiz", "Politique") ] = "France"
y[ y %in% c("Capitalisme", "Armée/Police", "Medias") ] = "Varia"
return(y)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment