briatte/charlie-data.r

## charlie-data.r
#
# download 338 Charlie Hebdo covers with keywords
#

library(dplyr)
library(XML)
library(lubridate)
library(stringr)
library(ggplot2)

source("charlie-themes.r")

dir.create("pages", showWarnings = FALSE)
dir.create("covers", showWarnings = FALSE)

covers = "charlie.csv"
if(!file.exists(covers)) {

  d = data.frame()
  for(i in seq(330, 0, -10)) {

    cat("Parsing page", sprintf("%2.0f", i / 10))
    file = paste0("pages/page-", i, ".html")

    if(!file.exists(file))
      h = try(download.file(paste0("http://stripsjournal.canalblog.com/tag/Les%20Unes%20de%20Charlie%20Hebdo/p",
                           i, "-0.html"), file, mode = "wb", quiet = TRUE), silent = TRUE)

    if(!file.info(file)$size) {

      cat(": failed\n")
      file.remove(file)

    } else {

      h = htmlParse(file)

      addr = xpathSApply(h, "//div[@class='blogbody']//meta[@itemprop='url']/@content")
      date = xpathSApply(h, "//div[@class='blogbody']//meta[@itemprop='dateCreated']/@content")
      titl = xpathSApply(h, "//div[@class='blogbody']//h3", xmlValue)

      dat = gsub("(.*) - (.*) - (.*)", "\\3", titl)
      dat = gsub("(.*)- (.*)", "\\2", dat)
      dat = gsub("octore", "octobre", dat)
      dat = gsub("sept\\.", "sep.", dat)
      dat = parse_date_time(gsub("\\.", "", dat), "%d %m %y", locale = "fr_FR")
      dat = as.Date(dat)

      # fix one unparsable date
      dat[ addr == "http://stripsjournal.canalblog.com/archives/2012/11/14/26045046.html" ] =
        as.Date("2012-11-14")

      file = gsub("(.*) - (.*) - (.*)", "\\2 - \\1", titl)
      file = gsub("Charlie Hebdo Nª", "", file)
      file = paste(dat, file)

      kwd = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][1]")
      kwd = lapply(kwd, xpathSApply, "a[@rel='tag']", xmlValue)

      # fix a few keywords
      kwd = lapply(kwd, function(x) {
        y = tolower(x[ !x %in% c("Charlie Hebdo", "Les Unes de Charlie Hebdo") ])
        y[ y == "aubry" ] = "martine aubry"
        y[ y == "crise économique 2009..." ] = "crise économique 2009"
        y[ y == "tapie" ] = "bernard tapie"
        y[ y == "emmigration" ] = "émigration"
        y[ y == "univercité" ] = "université"
        return(gsub("œ", "oe", y))
      })
      kwd = sapply(kwd, paste0, collapse = ";")

      aut = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][2]")
      aut = lapply(aut, xpathSApply, "a[contains(@href, 'archives')]", xmlValue)
      aut = sapply(aut, head, 1)

      # img = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::p//img/@src")
      img = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][1]/following-sibling::p[2]")
      img = sapply(img, xpathSApply, "a/img/@src")
      img = sapply(img, function(x) ifelse(is.null(x), NA, x))

      # fix six parser errors
      img[ addr == "http://stripsjournal.canalblog.com/archives/2012/04/30/24141374.html" ] =
        "http://p0.storage.canalblog.com/06/27/177230/75268181.jpg"
      img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/20/28478698.html" ] =
        "http://p2.storage.canalblog.com/22/38/177230/91674307.jpg"
      img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/12/28419822.html" ] =
        "http://p1.storage.canalblog.com/15/42/177230/91450058_o.jpg"
      img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/05/28369503.html" ] =
        "http://p2.storage.canalblog.com/28/47/177230/91251269_o.jpg"
      img[ addr == "http://stripsjournal.canalblog.com/archives/2013/10/29/28316947.html" ] =
        "http://p4.storage.canalblog.com/44/10/177230/91032930_o.jpg"
      img[ addr == "http://stripsjournal.canalblog.com/archives/2015/01/12/31306656.html" ] =
        "http://p6.storage.canalblog.com/68/20/177230/101510919_o.png"

      file = paste0(file, gsub("(.*)\\.(gif|jpg|png)", ".\\2", img))

      d = rbind(d, data.frame(post_page = i,
                              post_url = addr, post_date = date, post_title = titl,
                              date = dat, tags = kwd, author = aut, image = img,
                              file, stringsAsFactors = FALSE))

      cat(":", sprintf("%3.0f", nrow(d)), "total covers\n")

    }

  }

  # fix three missing authors
  d$author[ d$author == "Index Dessinateurs" ] = NA
  d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2009/10/28/30971837.html" ] =
    "Luz"
  d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2014/05/21/30964112.html" ] =
    "Cabu"
  d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2014/05/14/30964082.html" ] =
    "Cabu"

  write.csv(d, covers, row.names = FALSE)

}

d = read.csv(covers, stringsAsFactors = FALSE)

for(i in which(!is.na(d$image))) {

  file = paste0("covers/", d$file[ i ])

  if(!file.exists(file))
    try(download.file(d$image[ i ], file, quiet = TRUE), silent = TRUE)

}

# dates
print(table(year(d$date), exclude = NULL))

d$year = year(d$date)
d$quarter = paste0(d$year, "_", quarter(d$date))

# numérotation
d$num = str_extract(d$post_title, "°\\d+")
d$num[ is.na(d$num) ] = str_extract(d$post_title[ is.na(d$num) ], "Charlie Hebdo \\d+")
d$num = gsub("\\D", "", d$num)
stopifnot(n_distinct(d$num) == nrow(d))

# authors
print(table(d$author, exclude = NULL))

# keywords
terms = unlist(strsplit(d$tags, ";"))

cat("\n", length(unique(terms)), "unique keywords:\n")
print(table(terms)[ table(terms) > quantile(table(terms), .99) ])

# themes
stopifnot(!length(terms[ is.na(themes(terms)) ]))

# complete edge list
full = data.frame()
for(i in 1:nrow(d)) {

  y = unlist(strsplit(d$tags[ i ], ";"))
  full = rbind(full, data.frame(
    expand.grid(i = y, j = y, stringsAsFactors = FALSE),
    author = d$author[ i ],
    quarter = d$quarter[ i ],
    weight = 1 / (1 + str_count(d$tags[ i ], ";")) # inverse weighting
  ))

}

csv = data.frame(tag = unique(terms), theme = themes(unique(terms))) %>%
  arrange(tag)

head(csv)

csv$nums = NA
K = strsplit(d$tags, ";")
for(i in csv$tag) {

  k = lapply(K, function(x) i %in% x)
  k = as.numeric(d$num[ which(sapply(k, isTRUE)) ])
  if(length(k))
    csv$nums[ csv$tag == i ] = paste0(k [ order(k) ], collapse = ";")

}

write.csv(csv, "charlie-tags.csv", row.names = FALSE)

#
# Thematic plots
#

th = c(
  "Armée/Police" = "Varia",
  "Capitalisme" = "Varia",
  "France" = "Varia",
  "International" = "International",
  "Medias" = "Varia",
  "Politique" = "Politique",
  "Religion" = "Religion",
  "Showbiz" = "Varia",
  "Terrorisme" = "Violence",
  "Varia" = "Varia",
  "Violence" = "Violence"
)

yy = lapply(d$tags, function(x) {
  y = unlist(strsplit(x, ";"))
  y = th[ themes(y, simplify = F) ]
  unique(y)
})

dd = data.frame()
for(i in unique(d$year)) {
  y = unlist(yy[ d$year == i ])
  print(i)
  print(table(y))
  dd = rbind(dd, data.frame(year = i, theme = names(table(y)),
                            freq = as.vector(table(y)),
                            prop = as.vector(table(y) / sum(table(y)))))
}

colors = c(
  "Politique" = "#80b1d3",        # Set3 / light blue
  "International" = "#fdb462", # Set3 / light orange
  "Religion" = "#4daf4a",      # Set1 / bright green
  "Terrorisme" = "#e41a1c",    # Set1 / bright red
  "Varia" = "#999999",         # Set1 / grey
  "Violence" = "#984ea3"       # Set1 / bright purple
)

qplot(data = dd, fill = theme, y = prop, x = factor(year), stat = "identity", geom = "bar") +
  scale_fill_manual("Thème", values = colors2) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  labs(y = "Fréquence relative\n", x = "\nAnnée")

ggsave("plots/charlie_themes_prop.pdf", width = 9, height = 8)

qplot(data = dd, fill = theme, y = freq, x = factor(year), stat = "identity", geom = "bar") +
  scale_fill_manual("Thème", values = colors2) +
  theme_bw() +
  theme(panel.grid = element_blank()) +
  labs(y = "Fréquence brute\n", x = "\nAnnée")

ggsave("plots/charlie_themes_freq.pdf", width = 9, height = 8)

# done

## charlie-ergm.r
#
# quick and dirty ERGM specification for the co-occurrence network
# NB: to be run after the exploratory models in charlie-models.r
#

# edge list

e = subset(full[, 1:2 ], i != j)
e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
e = data.frame(table(e$u))
e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$Var1),
               j = gsub("(.*)///(.*)", "\\2", e$Var1),
               n = e$Freq / 2)

# network

n = network(e[, 1:2 ], directed = FALSE)
t = themes(network.vertex.names(n), simplify = FALSE)

n %n% "covers" = 338
n %v% "theme" = t

set.edge.attribute(n, "weight", ifelse(e$n > 10, 10, e$n))
print(table(n %e% "weight"))

# > subset(e, n > 10)
# i                   j  n
# 779            élections      présidentielle 23
# 781            élections présidentielle 2012 22
# 789            élections             sarkozy 17
# 871       extrême droite                  fn 17
# 880       extrême droite       marine le pen 14
# 961                   fn       marine le pen 15
# 1189          intégrisme            religion 17
# 1563 présidentielle 2012             sarkozy 15
# 1571      présidentielle présidentielle 2012 22
# 1575      présidentielle             sarkozy 15

th = c(
  "Armée/Police" = "Varia",
  "Capitalisme" = "Varia",
  "France" = "Varia",
  "International" = "International",
  "Medias" = "Varia",
  "Politique" = "Politique",
  "Religion" = "Religion",
  "Showbiz" = "Varia",
  "Terrorisme" = "Violence",
  "Varia" = "Varia",
  "Violence" = "Violence"
)

# collapse themes
n %v% "theme" = th[ n %v% "theme" ]

print(table(n %v% "theme", exclude = NULL))

E = ergm(n ~ nodemix("theme"))
b = which(names(coef(E)) %in% c("mix.theme.France.France",
                                "mix.theme.International.International",
                                "mix.theme.Politique.Politique",
                                "mix.theme.Religion.Religion",
                                "mix.theme.Showbiz.Showbiz",
                                "mix.theme.Terrorisme.Terrorisme",
                                "mix.theme.Varia.Varia",
                                "mix.theme.Violence.Violence"))

b = c(b, which(grepl("Varia", names(coef(E)))))

#
# Baseline model
#

B = ergm(n ~ edges +
           nodefactor("theme", base = 4) +
           nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
           nodemix("theme", base = b),
         parallel = 4,
         control = control.ergm(seed = 4575,
                                MCMLE.termination = "precision",
                                MCMLE.effectiveSize = 50,   # enable adaptive burnin and interval
                                MCMLE.MCMC.precision = .01, # control how much noise is tolerable
                                MCMLE.maxit = 50))

print(summary(B))

#
# GWDSP-adjusted model
#

D = ergm(n ~ edges +
           nodefactor("theme", base = 4) +
           nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
           nodemix("theme", base = b) +
           gwdsp(1, fixed = TRUE),
         parallel = 4,
         control = control.ergm(seed = 4575,
                                MCMLE.termination = "precision",
                                MCMLE.effectiveSize = 50,   # enable adaptive burnin and interval
                                MCMLE.MCMC.precision = .01, # control how much noise is tolerable
                                MCMLE.maxit = 50))

print(summary(D))

#
# GWD-adjusted models, at various values of alpha decay
#

a = seq(0, 5, by = 0.1)
alphas = c()
L = list()

for(i in a) {

  cat("\nAttempting to fit at decay alpha =", i, "\n")

  E = try(ergm(n ~ edges +
             nodefactor("theme", base = 4) +
             nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
             nodemix("theme", base = b) +
             gwdegree(i, fixed = TRUE), # gwesp(1, fixed = TRUE), gwdsp(1, fixed = TRUE),
           parallel = 4,
           control = control.ergm(seed = 4575,
                                  MCMLE.termination = "precision",
                                  MCMLE.effectiveSize = 50,   # enable adaptive burnin and interval
                                  MCMLE.MCMC.precision = .01, # control how much noise is tolerable
                                  MCMLE.maxit = 50)), silent = TRUE)

  if(!"try-error" %in% class(E)) {

    print(summary(E))
    L[[ which(a == i) ]] = E
    alphas = c(alphas, i)

  }

}

L = L[ !sapply(L, is.null) ]
r = data.frame()

for(i in 1:length(L)) {

  r = rbind(r, data.frame(
    x = names(coef(L[[ i ]])),
    b = summary(L[[ i ]])$coefs[, 1],
    se = summary(L[[ i ]])$coefs[, 2],
    alpha = alphas[ i ],
    stringsAsFactors = FALSE))

}

r = bind_rows(r)
r$ub = r$b + 2 * r$se
r$lb = r$b - 2 * r$se

# see how the GWD term moves: a < 1 and a > 3 are alright
qplot(data = subset(r, grepl("gwd", x)),
      x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
  geom_hline(y = 0, lty = "dashed") +
  facet_wrap(~ alpha, scales = "free", nrow = 1) +
  labs(y = "Coefficient estimate\n", x = "\nGWD alpha decay parameter") +
  theme_bw() +
  theme(panel.grid = element_blank())

r = subset(r, alpha < 1 | alpha > 3)

qplot(data = subset(r, grepl("nodefactor", x)),
      x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
  geom_hline(y = 0, lty = "dashed") +
  facet_wrap(~ x) +
  theme_bw()

qplot(data = subset(r, grepl("nodematch", x)),
      x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
  geom_hline(y = 0, lty = "dashed") +
  facet_wrap(~ x) +
  theme_bw()

qplot(data = subset(r, grepl("mix", x)),
      x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
  geom_hline(y = 0, lty = "dashed") +
  facet_wrap(~ x) +
  labs(y = "Coefficient estimate\n", x = "\nGWD alpha decay parameter") +
  theme_bw() +
  theme(panel.grid = element_blank())

save(B, D, L, alphas, n, file = "charlie-ergm.rda")

# have a nice day

## charlie-models.r
#
# quick and dirty ERGM specifications for the co-occurrence networks
#

library(ergm)
library(ergm.count)

coefs = data.frame()

for(i in ls(pattern = "^net_")) {

  nn = get(i)

  # collapse general themes
  nn %v% "theme" = ifelse(nn %v% "theme" %in% c("France", "International"), "Varia", nn %v% "theme")

  cat("\n", i, ":\n\n")
  print(table(nn %v% "theme", exclude = NULL))

  E = ergm(nn ~ nodemix("theme"))
  b = which(names(coef(E)) %in% c("mix.theme.France.France", # useless
                                  "mix.theme.International.International",
                                  "mix.theme.Religion.Religion",
                                  "mix.theme.Terrorisme.Terrorisme",
                                  "mix.theme.Varia.Varia",
                                  "mix.theme.Violence.Violence"))

  b = c(b, which(grepl("Varia", names(coef(E)))))

  if(!exists(paste0("ergm_", i))) {

    # binary model for all networks
    E = ergm(nn ~ edges + nodefactor("theme") + nodemix("theme", base = b),
             control = control.ergm(seed = 4575,
                                    MCMLE.termination = "precision",
                                    MCMLE.effectiveSize = 50,   # enable adaptive burnin and interval
                                    MCMLE.MCMC.precision = .01, # control how much noise is tolerable
                                    MCMLE.maxit = 50))

    # add weighted model for author-specific networks
    if(!grepl("\\d", i)) {

      W = try(ergm(nn ~ nonzero + sum + nodefactor("theme") + nodemix("theme", base = b),
                   response = "weight", reference = ~ Poisson,
                   control = control.ergm(seed = 4575,
                                          MCMC.prop.weights = "0inflated",
                                          MCMLE.trustregion = 1000,
                                          MCMLE.termination = "precision",
                                          MCMLE.effectiveSize = 50,   # enable adaptive burnin and interval
                                          MCMLE.MCMC.precision = .01, # control how much noise is tolerable
                                          MCMLE.maxit = 50)))

      assign(paste0("wergm_", i), W)

    }

  } else {

    E = get(paste0("ergm_", i))

  }

  assign(paste0("ergm_", i), E)

  coefs = rbind(coefs, data.frame(
    x0 = "B",
    network = gsub("net_", "", i),
    x = names(coef(E)),
    b = coef(E),
    se = summary(E)$coefs[ , 2 ],
    stringsAsFactors = FALSE
  ))

  if(exists(paste0("wergm_", i))) {

    W = get(paste0("wergm_", i))

    if(!"try-error" %in% class(W))
      coefs = rbind(coefs, data.frame(
        x0 = "W",
        network = gsub("net_", "", i),
        x = names(coef(W)),
        b = coef(W),
        se = summary(W)$coefs[ , 2 ],
        stringsAsFactors = FALSE
      ))

  }

}

#
# temporal ERGM results
#

qplot(data = subset(coefs, grepl("20(09|10|11|12|13|14)", network) &
                      x0 == "B" & grepl("mix", x) & !is.infinite(b) & se < 10),
      y = b, x = x0, color = x) +
  geom_segment(aes(xend = x0, y = b - 2 * se, yend = b + 2 *se)) +
  geom_hline(y = 0, lty = "dashed") +
  facet_grid(x ~ network) +
  scale_color_brewer("Coefficients", palette = "Set1") +
  labs(x = NULL, y = "log Pr( co-occurrence )\n") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid = element_blank(),
        legend.position = "bottom")

ggsave("ergm_time.pdf", width = 9, height = 8)

#
# author-specific ERGM results, binary and valued
#

coefs$x = gsub("sum\\.", "", coefs$x)

qplot(data = subset(coefs, grepl("Cabu|Charb|Riss|Luz", network) &
                      grepl("mix", x) & !is.infinite(b) & se < 10),
      y = b, x = x0, color = x) +
  geom_segment(aes(xend = x0, y = b - 2 * se, yend = b + 2 *se)) +
  geom_hline(y = 0, lty = "dashed") +
  facet_grid(x ~ network) +
  scale_color_brewer("Coefficients", palette = "Set1") +
  labs(x = NULL, y = "log Pr( co-occurrence )\n") +
  theme_bw() +
  theme(#axis.text.x = element_blank(),
        #axis.ticks.x = element_blank(),
        panel.grid = element_blank(),
        legend.position = "bottom")

ggsave("ergm_authors.pdf", width = 9, height = 8)

# kthxbye

## charlie-networks.r
#
# build temporal and author-specific co-occurrence networks
#

library(animation)
library(dplyr)
library(GGally)
library(ggplot2)
library(network)

source("charlie-data.r")

dir.create("plots", showWarnings = FALSE)

colors = c(
  "France" = "#80b1d3",        # Set3 / light blue
  "International" = "#fdb462", # Set3 / light orange
  "Religion" = "#4daf4a",      # Set1 / bright green
  "Terrorisme" = "#e41a1c",    # Set1 / bright red
  "Varia" = "#999999",         # Set1 / grey
  "Violence" = "#984ea3"       # Set1 / bright purple
)

#
# time-specific term networks
#

prop = data.frame()

for(k in names(table(d$quarter)[ table(d$quarter) > 10 | names(table(d$quarter)) == "2015_1" ])) {

  # remove self-loops and select author
  e = subset(full, i != j & quarter == k)[, 1:2 ]

  cat("Building:", k, nrow(e), "edges\n")

  # aggregate similar rows
  e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
  e = data.frame(table(e$u))

  e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$Var1),
                 j = gsub("(.*)///(.*)", "\\2", e$Var1),
                 n = e$Freq / 2)

  n = network(e[, 1:2 ], directed = FALSE)
  t = themes(network.vertex.names(n))

  n %n% "period" = k
  n %v% "theme" = t

  set.edge.attribute(n, "weight", e$n)
  ## print(table(n %e% "weight"))

  # compare network dimensions
  nv0 = network.size(n)
  ne0 = network.edgecount(n)
  nd = network.density(n)

  g = ggnet(n, label.nodes = network.vertex.names(n),
        node.group = t, node.color = colors[ names(colors) %in% t ],
        size = 0, label.size = 4, segment.color = "grey50",
        segment.alpha = .5) +
    theme(legend.key = element_blank(), legend.position = "bottom") +
    ggtitle(paste("Période", gsub("_", "-", k),
                  "n =", table(d$quarter)[ names(table(d$quarter)) == k ],
                  "'unes'", nv0, "mots-clés", ne0, "co-occurrences\n"))

  ggsave(paste0("plots/network_", k, ".pdf"), g, width = 12, height = 12)

  assign(paste0("plot_", k), g)
  assign(paste0("net_", k), n)
  assign(paste0("edges_", k), e)

  # subset to selected themes
  t = t %in% c("Religion", "Terrorisme", "Violence")
  delete.vertices(n, which(!t))

  prop = rbind(prop, data.frame(t = k,
                                n = table(d$quarter)[ names(table(d$quarter)) == k ],
                                d = nd,
                                nv0, nv1 = network.size(n),
                                ne0, ne1 = network.edgecount(n),
                                stringsAsFactors = FALSE))

  print(tail(prop, 1))

}

stop()

# dimensions

summary(prop$d)

prop$pv = with(prop, nv1 / nv0)
prop$pe = with(prop, ne1 / ne0)

# plot

prop$year = substr(prop$t, 1, 4)
prop$quarter = substr(prop$t, 6, 7)

prop = arrange(prop, -pe) %>%
  mutate(color = 1:n()) %>%
  mutate(color = cut(color, breaks = c(1, 3, 7, Inf), include.lowest = TRUE))

qplot(data = subset(prop, year %in% 2009:2014),
      x = quarter, y = pe, group = year, fill = color,
      stat = "identity", geom = "bar") +
  geom_hline(y = prop$pe[ prop$t == "2015_1" ], lty = "dashed") +
  facet_wrap(~ year, nrow = 1) +
  scale_fill_manual("", values = c("[1,3]" = "#2c7fb8", "(3,7]" = "#7fcdbb", "(7,Inf]" = "#edf8b1")) +
  guides(fill = FALSE) +
  labs(y = "Proportion des liens 'Religion, Terrorisme, Violence'\n",
       x = "\nPériode annuelle : 1. janvier-mars 2. avril-juin 3. juillet-septembre 4. octobre-décembre") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank())

ggsave("plots/prop_edges.pdf", width = 9, height = 7)

prop = arrange(prop, -pv) %>%
  mutate(color = 1:n()) %>%
  mutate(color = cut(color, breaks = c(1, 3, 7, Inf), include.lowest = TRUE))

qplot(data = subset(prop, year %in% 2009:2014),
      x = quarter, y = pv, group = year, fill = color,
      stat = "identity", geom = "bar") +
  geom_hline(y = prop$pv[ prop$t == "2015_1" ], lty = "dashed") +
  facet_wrap(~ year, nrow = 1) +
  scale_fill_manual("", values = c("[1,3]" = "#2c7fb8", "(3,7]" = "#7fcdbb", "(7,Inf]" = "#edf8b1")) +
  guides(fill = FALSE) +
  labs(y = "Proportion des mots-clés 'Religion, Terrorisme, Violence'\n",
       x = "\nPériode annuelle : 1. janvier-mars 2. avril-juin 3. juillet-septembre 4. octobre-décembre") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank())

ggsave("plots/prop_nodes.pdf", width = 9, height = 7)

# movie

saveGIF({
  for(i in sort(ls(pattern = "plot_\\d+")))
    print(get(i))
}, movie.name = "network_quarters.gif", ani.width = 600, ani.height = 600)

#
# author-specific term networks
#

for(k in names(table(d$author)[ table(d$author) > 50 ])) {

  # remove self-loops and select author
  e = subset(full, i != j & author == k)[, 1:2 ]

  cat("Building:", k, nrow(e), "edges\n")

  # aggregate similar rows
  e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
  e = data.frame(table(e$u))
  e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$Var1),
                     j = gsub("(.*)///(.*)", "\\2", e$Var1),
                     n = e$Freq / 2)

  n = network(e[, 1:2 ], directed = FALSE)
  t = themes(network.vertex.names(n))

  n %n% "period" = k
  n %v% "theme" = t

  set.edge.attribute(n, "weight", e$n)
  ## print(table(n %e% "weight"))

  assign(paste0("plot_", k), g)
  assign(paste0("net_", k), n)
  assign(paste0("edges_", k), e)

  ggnet(n, label.nodes = network.vertex.names(n),
        node.group = t, node.color = colors[ names(colors) %in% t ],
        size = 0, label.size = 4, segment.color = "grey50",
        segment.alpha = .5) +
    theme(legend.key = element_blank(), legend.position = "bottom") +
    ggtitle(paste(k, ", n =", table(d$author)[ names(table(d$author)) == k ],
                  "'unes'", network.size(n), "mots-clés",
                  network.edgecount(n), "co-occurrences\n"))

  ggsave(paste0("plots/network_", k, ".pdf"), width = 12, height = 12)

}

# done

## charlie-tags.csv

          
            tag
            theme
            nums

            
              11 septembre
              Terrorisme
              848

            
              14 juillet
              Varia
              996

            
              2022
              Varia
              1177

            
              absentéisme
              Varia
              991

            
              afghanistan
              International
              850

            
              agriculture
              Varia
              974;1116

            
              ahmadinejad
              International
              879

            
              air france
              Varia
              885

            
              airbus
              Varia
              885

            
              al-qaida
              Terrorisme
              985

            
              alcoolisme
              Varia
              935

            
              algérie
              International
              971

            
              alimentation
              Varia
              1079

            
              allègre
              France
              929

            
              allemagne
              International
              1039;1065;1110

            
              amende
              Varia
              884

            
              anne sinclair
              France
              990;994

            
              anniversaire
              Varia
              881;933

            
              antisémitisme
              Religion
              879;977;1013;1031;1124;1125;1153

            
              armée
              Varia
              850;992;996;1075;1099;1106

            
              armes
              Varia
              1133

            
              assassinat
              Violence
              1031;1032;1062

            
              astérix
              Varia
              1061

            
              athées
              Religion
              847;983

            
              attentats
              Violence
              1118

            
              aviation
              Varia
              885;890

            
              avion
              Varia
              885;890;1154

            
              bac
              Varia
              887;991;1096

            
              bachar al-assad
              International
              1026;1106

            
              ballerine
              Varia
              1005

            
              banditisme
              Varia
              1107;1109

            
              banques
              Varia
              849;852;1013;1044;1102;1179

            
              bateau
              Varia
              1022

            
              batman
              Varia
              1049

            
              bayrou
              France
              886;898

            
              belgique
              International
              1056;1069

            
              ben ali
              International
              976

            
              ben laden
              Terrorisme
              959;985

            
              benoit xvi
              Religion
              847;877;928;962;1041;1070;1078;1080;1081

            
              berlusconi
              International
              1081

            
              bernadette chirac
              France
              1128

            
              bernard arnault
              Varia
              1056

            
              bernard tapie
              France
              1096;1102

            
              besson
              France
              880;900;906;908;914;921;952

            
              bettencourt
              France
              942;944;945

            
              bible
              Religion
              983

            
              bigard
              France
              848

            
              bizutage
              Varia
              1039

            
              bnp
              Varia
              1013

            
              bonne année
              Varia
              915

            
              bonus
              Varia
              895

            
              borloo
              France
              801

            
              bourse
              Varia
              849;851;895;999

            
              boutin
              France
              1092

            
              brésil
              International
              862;873;1101

            
              bretagne
              France
              1116;1117

            
              budget
              Varia
              1140

            
              burka
              Religion
              888;920

            
              bush
              International
              855

            
              ça déraille
              Varia
              1100

            
              capitalisme
              Varia
              876

            
              carbone
              Varia
              899

            
              caricature
              Varia
              1057

            
              carla bruni
              France
              914;927;931;946;1003;1006;1027;1030;1161

            
              catastrophes naturelles
              Varia
              918;978

            
              cavanna
              Varia
              1129

            
              cécile duflot
              France
              1132

            
              charia
              Religion
              1010;1011

            
              charter
              Varia
              950

            
              chirac
              France
              976;1000;1004

            
              chômage
              Varia
              850;868;937;1048;1079

            
              christiane taubira
              France
              1132

            
              cinéma
              Varia
              831;928;999;1091;1119;1151

            
              clearstream
              France
              901

            
              climat
              Varia
              924;925;929

            
              clonage
              Varia
              912

            
              closer
              France
              1174

            
              clowns
              Varia
              1168

            
              cocaïne
              Varia
              882

            
              cohn-bendit
              France
              886

            
              colonialisme
              Varia
              1153

            
              coluche
              Varia
              852;1171

            
              concept cars
              Varia
              957

            
              consommation
              Varia
              1018

            
              copé
              France
              1040;1053;1063;1066;1067;1070;1094;1105

            
              coran
              Religion
              983;1099

            
              corruption
              Varia
              973

            
              corse
              France
              1062

            
              crash test
              Varia
              885;890;1154

            
              crédit lyonnais
              Varia
              1102

            
              crise
              Varia
              849;851;852;853;860;868;872;877;891;896;937;999;1047;1052;1090;1100;1140;1152;1157

            
              crise économique 2009
              Varia
              868;872;877;891;896;937

            
              crise financière 2008
              Varia
              849;851;852;853;860

            
              croissance
              Varia
              870;896;1104;1125;1152;1157

            
              crs
              Varia
              949;958

            
              darcos
              France
              910

            
              dati
              France
              854

            
              de gaulle
              France
              939;1093

            
              delarue
              France
              1054

            
              délation
              Varia
              880

            
              démocratie
              Varia
              1094

            
              derrick
              Varia
              861

            
              dieu
              Religion
              1064;1108

            
              dieudonné
              France
              1124;1125

            
              dimanche
              Varia
              1111

            
              discrimination
              Violence
              966

            
              disparus
              Varia
              885

            
              drh
              Varia
              903

            
              drogue
              Varia
              1014;1054

            
              droite
              France
              887;926;1093

            
              dsk
              France
              968;987;989;990;993;994;998;1005;1010;1015;1051;1068;1081;1086;1134;1143;1160

            
              durban
              International
              879

            
              ebola
              Varia
              1156

            
              école
              Varia
              1073;1108;1130;1142

            
              écologie
              Varia
              801;899;982;1116

            
              économie
              Varia
              870;872;895;923;934;1104;1110;1125;1140;1141;1152;1157

            
              écoute
              Varia
              1003

            
              education nationale
              Varia
              865;883;1142

            
              egypte
              International
              972;1045;1099

            
              élections
              Varia
              729;831;846;855;856;885;886;925;932;970;978;980;982;994;1002;1007;1009;1020;1023;1024;1025;1027;1028;1029;1031;1032;1033;1034;1035;1036;1037;1038;1042;1043;1063;1066;1067;1094;1110;1135;1136;1177

            
              elvis
              Varia
              985

            
              elysée
              France
              1176

            
              émigration
              Varia
              1112

            
              enseignement
              Varia
              865;883

            
              entreprises
              Varia
              910;1141

            
              environnement
              Varia
              801;899;929;1116;1117

            
              épidémie
              Varia
              892;909;1156

            
              essence
              Varia
              957;1001

            
              été
              Varia
              890;891;1048;1103;1104;1156;1157

            
              europe
              Varia
              885;934;953;1013;1017;1039;1065;1112;1144

            
              européennes
              France
              885;886

            
              eva joly
              France
              996

            
              évêque
              Religion
              873

            
              examen
              Varia
              883

            
              expulsions
              Varia
              1114

            
              extrême droite
              France
              921;926;932;965;978;984;997;1028;1029;1031;1036;1043;1066;1070;1095;1109;1112;1135;1147;1172;1174

            
              fabius
              France
              1087;1089

            
              facebook
              Varia
              935

            
              famille
              Varia
              1164

            
              farine animale
              Varia
              1079

            
              femen
              Varia
              1053;1081

            
              figaro
              Varia
              1113

            
              fillon
              France
              868;926;961;964;996;1014;1040;1053;1063;1066;1067;1070;1105;1169

            
              florange
              Varia
              1059;1068

            
              florian philippot
              France
              1174

            
              fmi
              Varia
              987;989

            
              fn
              France
              921;926;932;978;979;984;997;1031;1036;1043;1066;1095;1109;1112;1135;1136;1144;1145;1147;1150;1165;1172;1174

            
              foot
              Varia
              937;938;939;940;986;1115;1148;1149

            
              france
              Varia
              918;1039;1065;1106;1162

            
              france télécom
              Varia
              902

            
              france télévision
              Varia
              864

            
              françois 1er
              Religion
              1082;1083;1101;1179

            
              françois hollande
              France
              846;1008;1009;1023;1033;1038;1039;1044;1045;1047;1048;1052;1055;1061;1072;1075;1086;1089;1090;1091;1098;1103;1107;1110;1114;1119;1121;1122;1123;1125;1126;1132;1134;1139;1146;1152;1157;1160;1171;1176

            
              françois mitterrand
              France
              969;1000

            
              frédéric mitterrand
              France
              904

            
              frigide barjot
              France
              1088;1092

            
              galliano
              France
              977

            
              gauche
              France
              887;1047;1138

            
              gaza
              International
              863;865

            
              georges lautner
              France
              1119

            
              gérard depardieu
              France
              1069;1070;1133

            
              gigolo
              Varia
              942

            
              glou glou
              Varia
              1103;1112

            
              grèce
              International
              1052

            
              grenelle
              France
              801

            
              grève
              Varia
              875;957;1151

            
              grippe
              Varia
              892;909

            
              guadeloupe
              France
              871

            
              guéant
              France
              1014;1025;1029;1040;1053

            
              guerre
              Violence
              850;894;939;979;985;1001;1075;1106;1107;1155

            
              haïti
              International
              918

            
              hara-kiri
              Varia
              1129

            
              hitler
              Varia
              1013

            
              homophobie
              Violence
              1012;1046;1064;1073;1074;1088;1092;1093

            
              hortefeux
              France
              900;948;1040;1053;1105

            
              houellebecq
              France
              1177

            
              hulot
              France
              982

            
              humanisme
              Varia
              991

            
              immigration
              Varia
              880;906;946;1112

            
              impôts
              Varia
              853;1024;1056;1069;1086;1106;1113;1115;1116;1117

            
              incendie criminel
              Violence
              1012

            
              intégrisme
              Religion
              866;888;917;920;959;1011;1012;1016;1049;1057;1058;1070;1073;1075;1099;1130;1142;1143;1162;1163;1166;1170

            
              intempéries
              Varia
              1097

            
              internet
              Varia
              935

            
              irak
              International
              1170

            
              iron man
              Varia
              1091

            
              israël
              International
              863;865;1153

            
              israël-palestine
              International
              863;865;866;1153

            
              japon
              International
              978

            
              jean-luc mélenchon
              France
              1033;1042

            
              jean-marc ayrault
              France
              1055;1072;1087;1089;1122;1132

            
              jean-pierre pernaut
              France
              919;960

            
              jean-yves le drian
              France
              1132

            
              jeanne d'arc
              Varia
              1145

            
              jérôme cahuzac
              France
              1086;1087

            
              jesus
              Religion
              905;966;983;1016;1064;1082;1084;1173

            
              jihad
              Terrorisme
              1170

            
              jo
              Varia
              1049;1050;1131

            
              johnny hallyday
              France
              913;936

            
              journal
              Varia
              1113;1118

            
              journalisme
              Varia
              1050

            
              journée de...
              Varia
              924;1004

            
              juppé
              France
              1014

            
              justice
              Varia
              901;945;974;976;987;990;1071;1102

            
              kadhafi
              International
              975;976;1001

            
              kouchner
              France
              961

            
              krach
              Varia
              999

            
              l'oréal
              Varia
              944;945

            
              la poste
              Varia
              903

            
              lagarde
              France
              849;989

            
              laïcité
              Religion
              981;1108

            
              lcl
              Varia
              1102

            
              le cimetière
              Varia
              853;861;889;930;985;1000;1054;1129

            
              le monde
              Varia
              1003

            
              le pen
              France
              926;965;1028;1029;1031;1109;1147;1150;1165;1172

            
              législatives
              France
              1042;1043;1110

            
              les dictatures
              International
              975

            
              les firmes
              Varia
              902;944;945;1021;1022;1085;1111

            
              les riches
              Varia
              1056

            
              les verts
              France
              996

            
              libération
              Varia
              1118

            
              libye
              International
              975;979;1001;1010;1011

            
              livres
              Varia
              1030;1177

            
              logement
              Varia
              859

            
              lois
              Varia
              1046

            
              lourdes
              Religion
              1097

            
              luc chatel
              France
              1040

            
              luc ferry
              France
              991

            
              madonna
              France
              1042

            
              mahomet
              Religion
              1163;1178

            
              malaise
              Varia
              893

            
              mali
              International
              1074;1075

            
              mam
              France
              971;973;975

            
              manifestations
              Varia
              867;874;875;949;951;956;957;975;992;1073;1074;1099;1113;1164

            
              manuel valls
              France
              1089;1132;1137;1138;1139;1140;1158

            
              mariage
              Varia
              1046;1074;1076;1088;1092;1093

            
              marine le pen
              France
              965;970;977;978;979;984;997;1028;1029;1031;1042;1043;1066;1070;1095;1112;1144;1145;1147;1150;1165;1167;1171;1172;1179

            
              maroc
              International
              971

            
              marseille
              France
              1107

            
              martine aubry
              France
              858;898;968;988;995;1008

            
              massacre
              Violence
              1178

            
              mc donald's
              Varia
              1168

            
              medef
              Varia
              875

            
              médicament
              Varia
              743;916

            
              mer
              Varia
              890;891;895;944

            
              merkel
              International
              1039;1065;1110

            
              michael jackson
              France
              889

            
              michel sapin
              France
              1132

            
              ministre
              France
              1137

            
              mireille mathieu
              France
              1059

            
              miss france
              France
              964;1120

            
              mitt romney
              International
              1063

            
              mode
              Varia
              977

            
              mohamed merah
              Terrorisme
              1060

            
              montebourg
              France
              1089;1132;1141;1158

            
              moubarak
              International
              976

            
              municipales
              France
              1094;1135;1136

            
              nadine morano
              France
              1014;1021

            
              najat belkacem
              France
              1159

            
              nathalie kosciusko-morizet
              France
              1094

            
              nazisme
              Varia
              861;1013

            
              négationnisme
              Varia
              869;879

            
              népotisme
              Varia
              904

            
              nice
              France
              1109

            
              nigeria
              International
              1143

            
              noël
              Varia
              858;860;914;966;1018;1173

            
              norvège
              International
              997

            
              nucléaire
              Varia
              729

            
              nudisme
              Varia
              1132

            
              obama
              International
              846;855;866;1107

            
              opus dei
              Religion
              905

            
              otage
              Terrorisme
              987

            
              ouverture
              Varia
              887;897;927

            
              palestine
              International
              863;865;1153

            
              pape
              Religion
              847;869;877;928;962;1041;1078;1080;1081;1082;1083;1101;1179

            
              paris
              France
              1094

            
              parisot
              Varia
              875

            
              parricide
              Violence
              1076

            
              patrick buisson
              France
              1134

            
              patrimoine
              Varia
              1087

            
              pédophilie
              Violence
              873;894;928;1073

            
              pétrole
              Varia
              1001

            
              phallocrate
              Varia
              1127

            
              pharmaceutique
              Varia
              916

            
              philosophie
              Varia
              887;991

            
              pierre moscovici
              France
              1089

            
              pip
              Varia
              1021;1022

            
              pirates
              Varia
              878

            
              pitbulls
              Varia
              961

            
              plage
              Varia
              890;891;944;997

            
              pma
              Varia
              1077

            
              polanski
              France
              928

            
              police
              Varia
              884;949;958

            
              pollution
              Varia
              1116

            
              poséidon adventure
              Varia
              1022

            
              poutine
              International
              1053;1131;1133

            
              préservatif
              Varia
              877;962;994

            
              président
              France
              1126;1149

            
              présidentielle
              France
              932;970;980;982;994;1002;1009;1020;1023;1024;1025;1027;1028;1029;1031;1032;1033;1034;1035;1036;1037;1038;1063

            
              présidentielle 2007
              France
              729

            
              présidentielle 2012
              France
              932;970;980;982;994;1002;1009;1020;1023;1024;1025;1027;1028;1029;1031;1032;1033;1034;1035;1036;1037;1038

            
              présidentielle 2017
              France
              1161

            
              presse
              Varia
              1003;1118;1174

            
              primaires
              France
              1007;1094

            
              prisons
              Varia
              854;1053;1175

            
              privatisation
              Varia
              903

            
              prof
              Varia
              883

            
              prophète
              Religion
              905;966;983;1016;1064;1082;1084;1163;1178

            
              prostitution
              Varia
              1120

            
              provence-alpes-côte d'azur
              France
              1107;1109

            
              ps
              France
              831;857;888;968;988;1007;1009;1055;1086;1087;1132;1158

            
              pub
              Varia
              861;864;982

            
              pussy riot
              International
              1053

            
              qatar
              International
              1059

            
              radio
              Varia
              941

            
              radio france
              Varia
              941

            
              rama yade
              France
              908;961

            
              ramadan
              Religion
              1177

            
              raymond domenech
              France
              940

            
              recyclage
              Varia
              1091

            
              réforme
              Varia
              876

            
              régime
              Varia
              963

            
              régionales
              France
              925

            
              régions françaises
              France
              1116;1117

            
              religion
              Religion
              847;873;888;917;920;928;962;981;983;1011;1012;1016;1049;1057;1058;1064;1073;1078;1080;1082;1083;1097;1099;1101;1108;1111;1130;1142;1143;1162;1163;1166;1173;1177;1178

            
              remaniement
              Varia
              954;1132;1158

            
              rentrée
              Varia
              1002

            
              répression
              Varia
              975;1026

            
              retraite
              Varia
              922;930;935;949;950;951;955;1078

            
              révolution
              Varia
              972;975;1026

            
              rigueur
              Varia
              1090

            
              rio
              International
              1101

            
              rolex
              France
              877

            
              roselyne bachelot
              France
              909

            
              roumanie
              International
              950

            
              rumeur
              Varia
              995;998

            
              russie
              International
              1131;1133

            
              santé
              Varia
              892;893;909;911;935;1021;1079;1117;1121

            
              sarkozy
              France
              743;858;859;862;863;867;868;870;871;872;874;878;881;882;884;893;899;901;904;907;912;914;915;919;921;922;923;925;927;931;933;941;943;944;946;950;953;954;957;964;967;970;974;979;980;1000;1002;1003;1004;1006;1007;1009;1019;1020;1023;1024;1025;1026;1027;1030;1032;1034;1035;1036;1037;1038;1051;1084;1096;1105;1121;1134;1137;1151;1161;1162;1167;1169;1171;1179

            
              science
              Varia
              917;929

            
              sdf
              Varia
              859;860;874

            
              sécurité
              Varia
              946

            
              sécurité sociale
              Varia
              936

            
              ségolène royal
              France
              729;831;856;857;858

            
              sénat
              France
              1006

            
              sénateur
              France
              1006

            
              sexisme
              Violence
              924;988;1010;1081;1127;1159

            
              sexualité
              Varia
              743;1155

            
              siné
              Varia
              1071

            
              smic
              Varia
              1148

            
              social
              Varia
              1077

            
              soeur emmanuelle
              Varia
              853

            
              sondages
              Varia
              993;1023;1176

            
              sotchi
              Varia
              1131

            
              sport
              Varia
              882;934;937;938;939;940;1049;1050;1098;1115;1131;1148;1149

            
              st valentin
              Varia
              869

            
              stock-options
              Varia
              875

            
              suicide
              Varia
              854;902;903;1086

            
              suisse
              International
              1087

            
              surpopulation
              Varia
              1077

            
              syrie
              International
              1106;1107;1170

            
              taser
              Varia
              859

            
              taxe
              Varia
              899;1116;1117

            
              télé
              Varia
              864;1083;1085;1151;1169;1175

            
              téléphone
              Varia
              898

            
              téléthon
              Varia
              911

            
              tempête
              Varia
              924;925

            
              terrorisme
              Terrorisme
              955;959;985;1032;1166;1178;1179

            
              tf1
              Varia
              919;960;1085

            
              théâtre
              Varia
              1016;1151

            
              torah
              Religion
              983

            
              torture
              Violence
              1011

            
              totalitarisme
              Varia
              975

            
              toulouse
              France
              1031;1032

            
              trader
              Varia
              895

            
              train
              Varia
              1100

            
              travail
              Varia
              1111

            
              tueur de masse
              Violence
              1076

            
              tunisie
              International
              971;972;973;1011

            
              tva
              Varia
              1024

            
              twitter
              Varia
              1045

            
              ukraine
              International
              1133

            
              ump
              France
              897;925;926;948;958;961;979;1004;1014;1015;1036;1040;1053;1066;1067;1094;1105

            
              université
              Varia
              883

            
              usa
              International
              846;851;855;918;987;1063

            
              vacances
              Varia
              862;890;891;895;972;993;1103;1104

            
              vache folle
              Varia
              1079

            
              valérie trierweiler
              France
              1044;1045;1047;1098;1125;1127;1128;1160

            
              vatican
              Religion
              877;1041;1082

            
              vie privée
              Varia
              1126

            
              villepin
              France
              901

            
              violence
              Violence
              865;883

            
              voeux
              Varia
              967;1019

            
              voile
              Religion
              888;1049;1122

            
              voiture
              Varia
              957

            
              wall street
              Varia
              851

            
              woerth
              France
              930;942;943;944;951

            
              xavier darcos
              France
              865

            
              xénophobie
              Violence
              900;947;948;949;953;986;996;1026;1114;1124;1125;1145;1165;1167

            
              yannick noah
              France
              1014

            
              zemmour
              France
              970;1165;1167

## charlie-themes.r
themes <- function(x) {

  y = rep(NA, length(x))

  # france
  y[ x %in% c("bretagne", "corse", "guadeloupe", "marseille", "nice", "paris",
              "provence-alpes-côte d'azur", "régions françaises",
              "toulouse") ] = "France"

  # politique française
  y[ x %in% c("allègre", "bayrou", "bernard tapie",
              "bernadette chirac", "besson", "bettencourt", "borloo","boutin",
              "cécile duflot", "chirac", "christiane taubira", "clearstream",
              "cohn-bendit", "copé", "darcos", "dati", "de gaulle", "droite",
              "dsk", "elysée", "européennes", "eva joly", "extrême droite",
              "fabius", "fillon", "florian philippot", "fn", "françois hollande",
              "françois mitterrand", "frédéric mitterrand", "gauche", "grenelle",
              "guéant", "hortefeux", "hulot", "jean-luc mélenchon", "jean-marc ayrault",
              "jean-yves le drian", "jérôme cahuzac", "juppé", "kouchner",
              "lagarde", "le pen", "législatives", "les verts", "luc chatel",
              "luc ferry", "mam", "manuel valls", "marine le pen", "martine aubry",
              "michel sapin", "ministre", "montebourg", "municipales",
              "nadine morano", "najat belkacem", "nathalie kosciusko-morizet",
              "patrick buisson", "pierre moscovici", "président", "présidentielle",
              "présidentielle 2007", "présidentielle 2012", "présidentielle 2017",
              "primaires", "ps", "rama yade", "régionales", "rolex",
              "roselyne bachelot", "sarkozy", "ségolène royal", "sénat", "sénateur",
              "ump", "valérie trierweiler", "villepin", "woerth",
              "xavier darcos") ] = "Politique"

  # économie : banques, capitalisme, industrie
  y[ x %in% c("air france", "airbus", "banques", "bernard arnault", "bnp", "bourse",
              "capitalisme", "crédit lyonnais", "crise", "crise économique 2009",
              "crise financière 2008", "entreprises", "florange", "fmi", "krach", "lcl",
              "les firmes", "les riches", "medef", "parisot", "stock-options", "trader",
              "wall street") ] = "Capitalisme"

  # showbiz et assimilés
  y[ x %in% c("anne sinclair", "bigard", "carla bruni", "closer", "delarue",
              "dieudonné", "frigide barjot", "galliano", "gérard depardieu",
              "georges lautner", "houellebecq",
              "jean-pierre pernaut", "johnny hallyday", "madonna", "michael jackson",
              "mireille mathieu", "miss france", "polanski", "raymond domenech", "yannick noah",
              "zemmour") ] = "Showbiz"

  # politique internationale
  y[ x %in% c("afghanistan", "ahmadinejad", "algérie", "allemagne", "bachar al-assad",
    "belgique", "ben ali", "berlusconi", "brésil", "bush", "durban", "egypte",
    "gaza", "grèce", "haïti", "irak", "israël", "israël-palestine", "japon",
    "kadhafi", "les dictatures", "libye", "mali", "maroc", "merkel", "mitt romney",
    "moubarak", "nigeria", "norvège", "obama", "palestine", "poutine", "pussy riot", "qatar",
    "rio", "roumanie", "russie", "suisse", "syrie", "tunisie", "ukraine", "usa" ) ] = "International"

  # terrorisme
  y[ x %in% c("al-qaida", "11 septembre", "ben laden", "jihad", "mohamed merah", "otage",
  "terrorisme") ] = "Terrorisme"

  # religion
  y[ x %in% c("antisémitisme", "athées", "benoit xvi", "bible", "burka", "charia", "coran",
              "dieu", "évêque", "françois 1er", "intégrisme", "jesus", "laïcité", "lourdes",
              "mahomet", "opus dei", "pape", "prophète", "ramadan", "religion", "torah",
              "vatican", "voile") ] = "Religion"

  # armée et police
  y[ x %in% c("14 juillet", "armée", "armes", "crs", "police") ] = "Armée/Police"

  # violence
  y[ x %in% c("assassinat", "attentats", "discrimination", "guerre", "homophobie",
              "incendie criminel", "massacre", "parricide", "pédophilie", "sexisme",
              "torture", "tueur de masse", "violence", "xénophobie") ] = "Violence"

  # médias
  y[ x %in% c("cavanna", "le monde", "libération", "figaro", "france télévision", "hara-kiri",
              "radio", "radio france", "siné", "télé", "tf1") ] = "Medias"

  # residuals
  # 'bac' is the diploma, not the police unit
  # 'bizutage' is not necessarily violent
  y[ x %in% c("2022", "absentéisme", "agriculture", "alcoolisme",
              "alimentation", "amende", "anniversaire", "astérix", "aviation", "avion", "bac",
              "ballerine", "banditisme", "bateau", "batman", "bizutage", "bonne année", "bonus",
              "budget", "ça déraille", "carbone", "caricature", "catastrophes naturelles",
              "charter", "chômage", "cinéma", "climat", "clonage", "clowns",
              "cocaïne", "colonialisme", "coluche", "concept cars", "consommation",
              "corruption", "crash test", "croissance",
              "délation", "démocratie", "derrick", "dimanche", "disparus", "drh", "drogue",
              "ebola", "école", "écologie", "économie",
              "écoute", "education nationale", "élections", "elvis",
              "émigration", "enseignement", "environnement",
              "épidémie", "essence", "été", "europe", "examen", "expulsions",
              "facebook", "famille", "farine animale", "femen",
              "foot", "france", "france télécom",
              "gigolo", "glou glou", "grève", "grippe",
              "hitler", "humanisme", "immigration", "impôts", "intempéries", "internet",
              "iron man", "jeanne d'arc", "jo", "journal", "journalisme", "journée de...",
              "justice", "l'oréal",
              "la poste", "le cimetière", "livres", "logement", "lois",
              "malaise", "manifestations", "mariage",
              "mc donald's", "médicament", "mer", "mode",
              "nazisme", "négationnisme", "népotisme", "noël",
              "nucléaire", "nudisme", "ouverture",
              "patrimoine", "pétrole", "phallocrate", "pharmaceutique",
              "philosophie", "pip", "pirates", "pitbulls",
              "plage", "pma", "pollution", "poséidon adventure",
              "préservatif", "presse",
              "prisons", "privatisation", "prof", "prostitution", "pub",
              "recyclage", "réforme", "régime",
              "remaniement", "rentrée", "répression", "retraite",
              "révolution", "rigueur", "rumeur", "santé", "science",
              "sdf", "sécurité", "sécurité sociale",
              "sexualité", "smic", "social",
              "soeur emmanuelle", "sondages", "sotchi", "sport", "st valentin",
              "suicide", "surpopulation", "taser", "taxe", "téléphone", "téléthon",
              "tempête", "théâtre", "totalitarisme", "train", "travail", "tva", "twitter",
              "université", "vacances", "vache folle",
              "vie privée", "voeux", "voiture") ] = "Varia"

  # simplify
  y[ y %in% c("Showbiz", "Politique") ] = "France"
  y[ y %in% c("Capitalisme", "Armée/Police", "Medias") ] = "Varia"

  return(y)

}
	#
	# download 338 Charlie Hebdo covers with keywords
	#

	library(dplyr)
	library(XML)
	library(lubridate)
	library(stringr)
	library(ggplot2)

	source("charlie-themes.r")

	dir.create("pages", showWarnings = FALSE)
	dir.create("covers", showWarnings = FALSE)

	covers = "charlie.csv"
	if(!file.exists(covers)) {

	d = data.frame()
	for(i in seq(330, 0, -10)) {

	cat("Parsing page", sprintf("%2.0f", i / 10))
	file = paste0("pages/page-", i, ".html")

	if(!file.exists(file))
	h = try(download.file(paste0("http://stripsjournal.canalblog.com/tag/Les%20Unes%20de%20Charlie%20Hebdo/p",
	i, "-0.html"), file, mode = "wb", quiet = TRUE), silent = TRUE)

	if(!file.info(file)$size) {

	cat(": failed\n")
	file.remove(file)

	} else {

	h = htmlParse(file)

	addr = xpathSApply(h, "//div[@class='blogbody']//meta[@itemprop='url']/@content")
	date = xpathSApply(h, "//div[@class='blogbody']//meta[@itemprop='dateCreated']/@content")
	titl = xpathSApply(h, "//div[@class='blogbody']//h3", xmlValue)

	dat = gsub("(.) - (.) - (.*)", "\\3", titl)
	dat = gsub("(.)- (.)", "\\2", dat)
	dat = gsub("octore", "octobre", dat)
	dat = gsub("sept\\.", "sep.", dat)
	dat = parse_date_time(gsub("\\.", "", dat), "%d %m %y", locale = "fr_FR")
	dat = as.Date(dat)

	# fix one unparsable date
	dat[ addr == "http://stripsjournal.canalblog.com/archives/2012/11/14/26045046.html" ] =
	as.Date("2012-11-14")

	file = gsub("(.) - (.) - (.*)", "\\2 - \\1", titl)
	file = gsub("Charlie Hebdo Nª", "", file)
	file = paste(dat, file)

	kwd = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][1]")
	kwd = lapply(kwd, xpathSApply, "a[@rel='tag']", xmlValue)

	# fix a few keywords
	kwd = lapply(kwd, function(x) {
	y = tolower(x[ !x %in% c("Charlie Hebdo", "Les Unes de Charlie Hebdo") ])
	y[ y == "aubry" ] = "martine aubry"
	y[ y == "crise économique 2009..." ] = "crise économique 2009"
	y[ y == "tapie" ] = "bernard tapie"
	y[ y == "emmigration" ] = "émigration"
	y[ y == "univercité" ] = "université"
	return(gsub("œ", "oe", y))
	})
	kwd = sapply(kwd, paste0, collapse = ";")

	aut = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][2]")
	aut = lapply(aut, xpathSApply, "a[contains(@href, 'archives')]", xmlValue)
	aut = sapply(aut, head, 1)

	# img = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::p//img/@src")
	img = xpathSApply(h, "//div[@class='blogbody']//h3/following-sibling::div[@class='itemfooter'][1]/following-sibling::p[2]")
	img = sapply(img, xpathSApply, "a/img/@src")
	img = sapply(img, function(x) ifelse(is.null(x), NA, x))

	# fix six parser errors
	img[ addr == "http://stripsjournal.canalblog.com/archives/2012/04/30/24141374.html" ] =
	"http://p0.storage.canalblog.com/06/27/177230/75268181.jpg"
	img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/20/28478698.html" ] =
	"http://p2.storage.canalblog.com/22/38/177230/91674307.jpg"
	img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/12/28419822.html" ] =
	"http://p1.storage.canalblog.com/15/42/177230/91450058_o.jpg"
	img[ addr == "http://stripsjournal.canalblog.com/archives/2013/11/05/28369503.html" ] =
	"http://p2.storage.canalblog.com/28/47/177230/91251269_o.jpg"
	img[ addr == "http://stripsjournal.canalblog.com/archives/2013/10/29/28316947.html" ] =
	"http://p4.storage.canalblog.com/44/10/177230/91032930_o.jpg"
	img[ addr == "http://stripsjournal.canalblog.com/archives/2015/01/12/31306656.html" ] =
	"http://p6.storage.canalblog.com/68/20/177230/101510919_o.png"

	file = paste0(file, gsub("(.*)\\.(gif\|jpg\|png)", ".\\2", img))

	d = rbind(d, data.frame(post_page = i,
	post_url = addr, post_date = date, post_title = titl,
	date = dat, tags = kwd, author = aut, image = img,
	file, stringsAsFactors = FALSE))

	cat(":", sprintf("%3.0f", nrow(d)), "total covers\n")

	}

	}

	# fix three missing authors
	d$author[ d$author == "Index Dessinateurs" ] = NA
	d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2009/10/28/30971837.html" ] =
	"Luz"
	d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2014/05/21/30964112.html" ] =
	"Cabu"
	d$author[ d$post_url == "http://stripsjournal.canalblog.com/archives/2014/05/14/30964082.html" ] =
	"Cabu"

	write.csv(d, covers, row.names = FALSE)

	}

	d = read.csv(covers, stringsAsFactors = FALSE)

	for(i in which(!is.na(d$image))) {

	file = paste0("covers/", d$file[ i ])

	if(!file.exists(file))
	try(download.file(d$image[ i ], file, quiet = TRUE), silent = TRUE)

	}

	# dates
	print(table(year(d$date), exclude = NULL))

	d$year = year(d$date)
	d$quarter = paste0(d$year, "_", quarter(d$date))

	# numérotation
	d$num = str_extract(d$post_title, "°\\d+")
	d$num[ is.na(d$num) ] = str_extract(d$post_title[ is.na(d$num) ], "Charlie Hebdo \\d+")
	d$num = gsub("\\D", "", d$num)
	stopifnot(n_distinct(d$num) == nrow(d))

	# authors
	print(table(d$author, exclude = NULL))

	# keywords
	terms = unlist(strsplit(d$tags, ";"))

	cat("\n", length(unique(terms)), "unique keywords:\n")
	print(table(terms)[ table(terms) > quantile(table(terms), .99) ])

	# themes
	stopifnot(!length(terms[ is.na(themes(terms)) ]))

	# complete edge list
	full = data.frame()
	for(i in 1:nrow(d)) {

	y = unlist(strsplit(d$tags[ i ], ";"))
	full = rbind(full, data.frame(
	expand.grid(i = y, j = y, stringsAsFactors = FALSE),
	author = d$author[ i ],
	quarter = d$quarter[ i ],
	weight = 1 / (1 + str_count(d$tags[ i ], ";")) # inverse weighting
	))

	}

	csv = data.frame(tag = unique(terms), theme = themes(unique(terms))) %>%
	arrange(tag)

	head(csv)

	csv$nums = NA
	K = strsplit(d$tags, ";")
	for(i in csv$tag) {

	k = lapply(K, function(x) i %in% x)
	k = as.numeric(d$num[ which(sapply(k, isTRUE)) ])
	if(length(k))
	csv$nums[ csv$tag == i ] = paste0(k [ order(k) ], collapse = ";")

	}

	write.csv(csv, "charlie-tags.csv", row.names = FALSE)

	#
	# Thematic plots
	#

	th = c(
	"Armée/Police" = "Varia",
	"Capitalisme" = "Varia",
	"France" = "Varia",
	"International" = "International",
	"Medias" = "Varia",
	"Politique" = "Politique",
	"Religion" = "Religion",
	"Showbiz" = "Varia",
	"Terrorisme" = "Violence",
	"Varia" = "Varia",
	"Violence" = "Violence"
	)

	yy = lapply(d$tags, function(x) {
	y = unlist(strsplit(x, ";"))
	y = th[ themes(y, simplify = F) ]
	unique(y)
	})

	dd = data.frame()
	for(i in unique(d$year)) {
	y = unlist(yy[ d$year == i ])
	print(i)
	print(table(y))
	dd = rbind(dd, data.frame(year = i, theme = names(table(y)),
	freq = as.vector(table(y)),
	prop = as.vector(table(y) / sum(table(y)))))
	}

	colors = c(
	"Politique" = "#80b1d3", # Set3 / light blue
	"International" = "#fdb462", # Set3 / light orange
	"Religion" = "#4daf4a", # Set1 / bright green
	"Terrorisme" = "#e41a1c", # Set1 / bright red
	"Varia" = "#999999", # Set1 / grey
	"Violence" = "#984ea3" # Set1 / bright purple
	)

	qplot(data = dd, fill = theme, y = prop, x = factor(year), stat = "identity", geom = "bar") +
	scale_fill_manual("Thème", values = colors2) +
	theme_bw() +
	theme(panel.grid = element_blank()) +
	labs(y = "Fréquence relative\n", x = "\nAnnée")

	ggsave("plots/charlie_themes_prop.pdf", width = 9, height = 8)

	qplot(data = dd, fill = theme, y = freq, x = factor(year), stat = "identity", geom = "bar") +
	scale_fill_manual("Thème", values = colors2) +
	theme_bw() +
	theme(panel.grid = element_blank()) +
	labs(y = "Fréquence brute\n", x = "\nAnnée")

	ggsave("plots/charlie_themes_freq.pdf", width = 9, height = 8)

	# done
	#
	# quick and dirty ERGM specification for the co-occurrence network
	# NB: to be run after the exploratory models in charlie-models.r
	#

	# edge list

	e = subset(full[, 1:2 ], i != j)
	e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
	e = data.frame(table(e$u))
	e = data.frame(i = gsub("(.)///(.)", "\\1", e$Var1),
	j = gsub("(.)///(.)", "\\2", e$Var1),
	n = e$Freq / 2)

	# network

	n = network(e[, 1:2 ], directed = FALSE)
	t = themes(network.vertex.names(n), simplify = FALSE)

	n %n% "covers" = 338
	n %v% "theme" = t

	set.edge.attribute(n, "weight", ifelse(e$n > 10, 10, e$n))
	print(table(n %e% "weight"))

	# > subset(e, n > 10)
	# i j n
	# 779 élections présidentielle 23
	# 781 élections présidentielle 2012 22
	# 789 élections sarkozy 17
	# 871 extrême droite fn 17
	# 880 extrême droite marine le pen 14
	# 961 fn marine le pen 15
	# 1189 intégrisme religion 17
	# 1563 présidentielle 2012 sarkozy 15
	# 1571 présidentielle présidentielle 2012 22
	# 1575 présidentielle sarkozy 15

	th = c(
	"Armée/Police" = "Varia",
	"Capitalisme" = "Varia",
	"France" = "Varia",
	"International" = "International",
	"Medias" = "Varia",
	"Politique" = "Politique",
	"Religion" = "Religion",
	"Showbiz" = "Varia",
	"Terrorisme" = "Violence",
	"Varia" = "Varia",
	"Violence" = "Violence"
	)

	# collapse themes
	n %v% "theme" = th[ n %v% "theme" ]

	print(table(n %v% "theme", exclude = NULL))

	E = ergm(n ~ nodemix("theme"))
	b = which(names(coef(E)) %in% c("mix.theme.France.France",
	"mix.theme.International.International",
	"mix.theme.Politique.Politique",
	"mix.theme.Religion.Religion",
	"mix.theme.Showbiz.Showbiz",
	"mix.theme.Terrorisme.Terrorisme",
	"mix.theme.Varia.Varia",
	"mix.theme.Violence.Violence"))

	b = c(b, which(grepl("Varia", names(coef(E)))))

	#
	# Baseline model
	#

	B = ergm(n ~ edges +
	nodefactor("theme", base = 4) +
	nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
	nodemix("theme", base = b),
	parallel = 4,
	control = control.ergm(seed = 4575,
	MCMLE.termination = "precision",
	MCMLE.effectiveSize = 50, # enable adaptive burnin and interval
	MCMLE.MCMC.precision = .01, # control how much noise is tolerable
	MCMLE.maxit = 50))

	print(summary(B))

	#
	# GWDSP-adjusted model
	#

	D = ergm(n ~ edges +
	nodefactor("theme", base = 4) +
	nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
	nodemix("theme", base = b) +
	gwdsp(1, fixed = TRUE),
	parallel = 4,
	control = control.ergm(seed = 4575,
	MCMLE.termination = "precision",
	MCMLE.effectiveSize = 50, # enable adaptive burnin and interval
	MCMLE.MCMC.precision = .01, # control how much noise is tolerable
	MCMLE.maxit = 50))

	print(summary(D))

	#
	# GWD-adjusted models, at various values of alpha decay
	#

	a = seq(0, 5, by = 0.1)
	alphas = c()
	L = list()

	for(i in a) {

	cat("\nAttempting to fit at decay alpha =", i, "\n")

	E = try(ergm(n ~ edges +
	nodefactor("theme", base = 4) +
	nodematch("theme", diff = TRUE, keep = c(1:3, 5)) +
	nodemix("theme", base = b) +
	gwdegree(i, fixed = TRUE), # gwesp(1, fixed = TRUE), gwdsp(1, fixed = TRUE),
	parallel = 4,
	control = control.ergm(seed = 4575,
	MCMLE.termination = "precision",
	MCMLE.effectiveSize = 50, # enable adaptive burnin and interval
	MCMLE.MCMC.precision = .01, # control how much noise is tolerable
	MCMLE.maxit = 50)), silent = TRUE)

	if(!"try-error" %in% class(E)) {

	print(summary(E))
	L[[ which(a == i) ]] = E
	alphas = c(alphas, i)

	}

	}

	L = L[ !sapply(L, is.null) ]
	r = data.frame()

	for(i in 1:length(L)) {

	r = rbind(r, data.frame(
	x = names(coef(L[[ i ]])),
	b = summary(L[[ i ]])$coefs[, 1],
	se = summary(L[[ i ]])$coefs[, 2],
	alpha = alphas[ i ],
	stringsAsFactors = FALSE))

	}

	r = bind_rows(r)
	r$ub = r$b + 2 * r$se
	r$lb = r$b - 2 * r$se

	# see how the GWD term moves: a < 1 and a > 3 are alright
	qplot(data = subset(r, grepl("gwd", x)),
	x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
	geom_hline(y = 0, lty = "dashed") +
	facet_wrap(~ alpha, scales = "free", nrow = 1) +
	labs(y = "Coefficient estimate\n", x = "\nGWD alpha decay parameter") +
	theme_bw() +
	theme(panel.grid = element_blank())

	r = subset(r, alpha < 1 \| alpha > 3)

	qplot(data = subset(r, grepl("nodefactor", x)),
	x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
	geom_hline(y = 0, lty = "dashed") +
	facet_wrap(~ x) +
	theme_bw()

	qplot(data = subset(r, grepl("nodematch", x)),
	x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
	geom_hline(y = 0, lty = "dashed") +
	facet_wrap(~ x) +
	theme_bw()

	qplot(data = subset(r, grepl("mix", x)),
	x = factor(alpha), y = b, ymin = lb, ymax = ub, geom = "pointrange") +
	geom_hline(y = 0, lty = "dashed") +
	facet_wrap(~ x) +
	labs(y = "Coefficient estimate\n", x = "\nGWD alpha decay parameter") +
	theme_bw() +
	theme(panel.grid = element_blank())

	save(B, D, L, alphas, n, file = "charlie-ergm.rda")

	# have a nice day
	#
	# quick and dirty ERGM specifications for the co-occurrence networks
	#

	library(ergm)
	library(ergm.count)

	coefs = data.frame()

	for(i in ls(pattern = "^net_")) {

	nn = get(i)

	# collapse general themes
	nn %v% "theme" = ifelse(nn %v% "theme" %in% c("France", "International"), "Varia", nn %v% "theme")

	cat("\n", i, ":\n\n")
	print(table(nn %v% "theme", exclude = NULL))

	E = ergm(nn ~ nodemix("theme"))
	b = which(names(coef(E)) %in% c("mix.theme.France.France", # useless
	"mix.theme.International.International",
	"mix.theme.Religion.Religion",
	"mix.theme.Terrorisme.Terrorisme",
	"mix.theme.Varia.Varia",
	"mix.theme.Violence.Violence"))

	b = c(b, which(grepl("Varia", names(coef(E)))))

	if(!exists(paste0("ergm_", i))) {

	# binary model for all networks
	E = ergm(nn ~ edges + nodefactor("theme") + nodemix("theme", base = b),
	control = control.ergm(seed = 4575,
	MCMLE.termination = "precision",
	MCMLE.effectiveSize = 50, # enable adaptive burnin and interval
	MCMLE.MCMC.precision = .01, # control how much noise is tolerable
	MCMLE.maxit = 50))

	# add weighted model for author-specific networks
	if(!grepl("\\d", i)) {

	W = try(ergm(nn ~ nonzero + sum + nodefactor("theme") + nodemix("theme", base = b),
	response = "weight", reference = ~ Poisson,
	control = control.ergm(seed = 4575,
	MCMC.prop.weights = "0inflated",
	MCMLE.trustregion = 1000,
	MCMLE.termination = "precision",
	MCMLE.effectiveSize = 50, # enable adaptive burnin and interval
	MCMLE.MCMC.precision = .01, # control how much noise is tolerable
	MCMLE.maxit = 50)))

	assign(paste0("wergm_", i), W)

	}

	} else {

	E = get(paste0("ergm_", i))

	}

	assign(paste0("ergm_", i), E)

	coefs = rbind(coefs, data.frame(
	x0 = "B",
	network = gsub("net_", "", i),
	x = names(coef(E)),
	b = coef(E),
	se = summary(E)$coefs[ , 2 ],
	stringsAsFactors = FALSE
	))

	if(exists(paste0("wergm_", i))) {

	W = get(paste0("wergm_", i))

	if(!"try-error" %in% class(W))
	coefs = rbind(coefs, data.frame(
	x0 = "W",
	network = gsub("net_", "", i),
	x = names(coef(W)),
	b = coef(W),
	se = summary(W)$coefs[ , 2 ],
	stringsAsFactors = FALSE
	))

	}

	}

	#
	# temporal ERGM results
	#

	qplot(data = subset(coefs, grepl("20(09\|10\|11\|12\|13\|14)", network) &
	x0 == "B" & grepl("mix", x) & !is.infinite(b) & se < 10),
	y = b, x = x0, color = x) +
	geom_segment(aes(xend = x0, y = b - 2 * se, yend = b + 2 *se)) +
	geom_hline(y = 0, lty = "dashed") +
	facet_grid(x ~ network) +
	scale_color_brewer("Coefficients", palette = "Set1") +
	labs(x = NULL, y = "log Pr( co-occurrence )\n") +
	theme_bw() +
	theme(axis.text.x = element_blank(),
	axis.ticks.x = element_blank(),
	panel.grid = element_blank(),
	legend.position = "bottom")

	ggsave("ergm_time.pdf", width = 9, height = 8)

	#
	# author-specific ERGM results, binary and valued
	#

	coefs$x = gsub("sum\\.", "", coefs$x)

	qplot(data = subset(coefs, grepl("Cabu\|Charb\|Riss\|Luz", network) &
	grepl("mix", x) & !is.infinite(b) & se < 10),
	y = b, x = x0, color = x) +
	geom_segment(aes(xend = x0, y = b - 2 * se, yend = b + 2 *se)) +
	geom_hline(y = 0, lty = "dashed") +
	facet_grid(x ~ network) +
	scale_color_brewer("Coefficients", palette = "Set1") +
	labs(x = NULL, y = "log Pr( co-occurrence )\n") +
	theme_bw() +
	theme(#axis.text.x = element_blank(),
	#axis.ticks.x = element_blank(),
	panel.grid = element_blank(),
	legend.position = "bottom")

	ggsave("ergm_authors.pdf", width = 9, height = 8)

	# kthxbye
	#
	# build temporal and author-specific co-occurrence networks
	#

	library(animation)
	library(dplyr)
	library(GGally)
	library(ggplot2)
	library(network)

	source("charlie-data.r")

	dir.create("plots", showWarnings = FALSE)

	colors = c(
	"France" = "#80b1d3", # Set3 / light blue
	"International" = "#fdb462", # Set3 / light orange
	"Religion" = "#4daf4a", # Set1 / bright green
	"Terrorisme" = "#e41a1c", # Set1 / bright red
	"Varia" = "#999999", # Set1 / grey
	"Violence" = "#984ea3" # Set1 / bright purple
	)

	#
	# time-specific term networks
	#

	prop = data.frame()

	for(k in names(table(d$quarter)[ table(d$quarter) > 10 \| names(table(d$quarter)) == "2015_1" ])) {

	# remove self-loops and select author
	e = subset(full, i != j & quarter == k)[, 1:2 ]

	cat("Building:", k, nrow(e), "edges\n")

	# aggregate similar rows
	e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
	e = data.frame(table(e$u))

	e = data.frame(i = gsub("(.)///(.)", "\\1", e$Var1),
	j = gsub("(.)///(.)", "\\2", e$Var1),
	n = e$Freq / 2)

	n = network(e[, 1:2 ], directed = FALSE)
	t = themes(network.vertex.names(n))

	n %n% "period" = k
	n %v% "theme" = t

	set.edge.attribute(n, "weight", e$n)
	## print(table(n %e% "weight"))

	# compare network dimensions
	nv0 = network.size(n)
	ne0 = network.edgecount(n)
	nd = network.density(n)

	g = ggnet(n, label.nodes = network.vertex.names(n),
	node.group = t, node.color = colors[ names(colors) %in% t ],
	size = 0, label.size = 4, segment.color = "grey50",
	segment.alpha = .5) +
	theme(legend.key = element_blank(), legend.position = "bottom") +
	ggtitle(paste("Période", gsub("_", "-", k),
	"n =", table(d$quarter)[ names(table(d$quarter)) == k ],
	"'unes'", nv0, "mots-clés", ne0, "co-occurrences\n"))

	ggsave(paste0("plots/network_", k, ".pdf"), g, width = 12, height = 12)

	assign(paste0("plot_", k), g)
	assign(paste0("net_", k), n)
	assign(paste0("edges_", k), e)

	# subset to selected themes
	t = t %in% c("Religion", "Terrorisme", "Violence")
	delete.vertices(n, which(!t))

	prop = rbind(prop, data.frame(t = k,
	n = table(d$quarter)[ names(table(d$quarter)) == k ],
	d = nd,
	nv0, nv1 = network.size(n),
	ne0, ne1 = network.edgecount(n),
	stringsAsFactors = FALSE))

	print(tail(prop, 1))

	}

	stop()

	# dimensions

	summary(prop$d)

	prop$pv = with(prop, nv1 / nv0)
	prop$pe = with(prop, ne1 / ne0)

	# plot

	prop$year = substr(prop$t, 1, 4)
	prop$quarter = substr(prop$t, 6, 7)

	prop = arrange(prop, -pe) %>%
	mutate(color = 1:n()) %>%
	mutate(color = cut(color, breaks = c(1, 3, 7, Inf), include.lowest = TRUE))

	qplot(data = subset(prop, year %in% 2009:2014),
	x = quarter, y = pe, group = year, fill = color,
	stat = "identity", geom = "bar") +
	geom_hline(y = prop$pe[ prop$t == "2015_1" ], lty = "dashed") +
	facet_wrap(~ year, nrow = 1) +
	scale_fill_manual("", values = c("[1,3]" = "#2c7fb8", "(3,7]" = "#7fcdbb", "(7,Inf]" = "#edf8b1")) +
	guides(fill = FALSE) +
	labs(y = "Proportion des liens 'Religion, Terrorisme, Violence'\n",
	x = "\nPériode annuelle : 1. janvier-mars 2. avril-juin 3. juillet-septembre 4. octobre-décembre") +
	theme_bw() +
	theme(panel.grid.major.x = element_blank())

	ggsave("plots/prop_edges.pdf", width = 9, height = 7)

	prop = arrange(prop, -pv) %>%
	mutate(color = 1:n()) %>%
	mutate(color = cut(color, breaks = c(1, 3, 7, Inf), include.lowest = TRUE))

	qplot(data = subset(prop, year %in% 2009:2014),
	x = quarter, y = pv, group = year, fill = color,
	stat = "identity", geom = "bar") +
	geom_hline(y = prop$pv[ prop$t == "2015_1" ], lty = "dashed") +
	facet_wrap(~ year, nrow = 1) +
	scale_fill_manual("", values = c("[1,3]" = "#2c7fb8", "(3,7]" = "#7fcdbb", "(7,Inf]" = "#edf8b1")) +
	guides(fill = FALSE) +
	labs(y = "Proportion des mots-clés 'Religion, Terrorisme, Violence'\n",
	x = "\nPériode annuelle : 1. janvier-mars 2. avril-juin 3. juillet-septembre 4. octobre-décembre") +
	theme_bw() +
	theme(panel.grid.major.x = element_blank())

	ggsave("plots/prop_nodes.pdf", width = 9, height = 7)

	# movie

	saveGIF({
	for(i in sort(ls(pattern = "plot_\\d+")))
	print(get(i))
	}, movie.name = "network_quarters.gif", ani.width = 600, ani.height = 600)

	#
	# author-specific term networks
	#

	for(k in names(table(d$author)[ table(d$author) > 50 ])) {

	# remove self-loops and select author
	e = subset(full, i != j & author == k)[, 1:2 ]

	cat("Building:", k, nrow(e), "edges\n")

	# aggregate similar rows
	e$u = apply(e, 1, function(x) paste0(sort(x), collapse = "///"))
	e = data.frame(table(e$u))
	e = data.frame(i = gsub("(.)///(.)", "\\1", e$Var1),
	j = gsub("(.)///(.)", "\\2", e$Var1),
	n = e$Freq / 2)

	n = network(e[, 1:2 ], directed = FALSE)
	t = themes(network.vertex.names(n))

	n %n% "period" = k
	n %v% "theme" = t

	set.edge.attribute(n, "weight", e$n)
	## print(table(n %e% "weight"))

	assign(paste0("plot_", k), g)
	assign(paste0("net_", k), n)
	assign(paste0("edges_", k), e)

	ggnet(n, label.nodes = network.vertex.names(n),
	node.group = t, node.color = colors[ names(colors) %in% t ],
	size = 0, label.size = 4, segment.color = "grey50",
	segment.alpha = .5) +
	theme(legend.key = element_blank(), legend.position = "bottom") +
	ggtitle(paste(k, ", n =", table(d$author)[ names(table(d$author)) == k ],
	"'unes'", network.size(n), "mots-clés",
	network.edgecount(n), "co-occurrences\n"))

	ggsave(paste0("plots/network_", k, ".pdf"), width = 12, height = 12)

	}

	# done
tag	theme	nums
11 septembre	Terrorisme	848
14 juillet	Varia	996
2022	Varia	1177
absentéisme	Varia	991
afghanistan	International	850
agriculture	Varia	974;1116
ahmadinejad	International	879
air france	Varia	885
airbus	Varia	885
al-qaida	Terrorisme	985
alcoolisme	Varia	935
algérie	International	971
alimentation	Varia	1079
allègre	France	929
allemagne	International	1039;1065;1110
amende	Varia	884
anne sinclair	France	990;994
anniversaire	Varia	881;933
antisémitisme	Religion	879;977;1013;1031;1124;1125;1153
armée	Varia	850;992;996;1075;1099;1106
armes	Varia	1133
assassinat	Violence	1031;1032;1062
astérix	Varia	1061
athées	Religion	847;983
attentats	Violence	1118
aviation	Varia	885;890
avion	Varia	885;890;1154
bac	Varia	887;991;1096
bachar al-assad	International	1026;1106
ballerine	Varia	1005
banditisme	Varia	1107;1109
banques	Varia	849;852;1013;1044;1102;1179
bateau	Varia	1022
batman	Varia	1049
bayrou	France	886;898
belgique	International	1056;1069
ben ali	International	976
ben laden	Terrorisme	959;985
benoit xvi	Religion	847;877;928;962;1041;1070;1078;1080;1081
berlusconi	International	1081
bernadette chirac	France	1128
bernard arnault	Varia	1056
bernard tapie	France	1096;1102
besson	France	880;900;906;908;914;921;952
bettencourt	France	942;944;945
bible	Religion	983
bigard	France	848
bizutage	Varia	1039
bnp	Varia	1013
bonne année	Varia	915
bonus	Varia	895
borloo	France	801
bourse	Varia	849;851;895;999
boutin	France	1092
brésil	International	862;873;1101
bretagne	France	1116;1117
budget	Varia	1140
burka	Religion	888;920
bush	International	855
ça déraille	Varia	1100
capitalisme	Varia	876
carbone	Varia	899
caricature	Varia	1057
carla bruni	France	914;927;931;946;1003;1006;1027;1030;1161
catastrophes naturelles	Varia	918;978
cavanna	Varia	1129
cécile duflot	France	1132
charia	Religion	1010;1011
charter	Varia	950
chirac	France	976;1000;1004
chômage	Varia	850;868;937;1048;1079
christiane taubira	France	1132
cinéma	Varia	831;928;999;1091;1119;1151
clearstream	France	901
climat	Varia	924;925;929
clonage	Varia	912
closer	France	1174
clowns	Varia	1168
cocaïne	Varia	882
cohn-bendit	France	886
colonialisme	Varia	1153
coluche	Varia	852;1171
concept cars	Varia	957
consommation	Varia	1018
copé	France	1040;1053;1063;1066;1067;1070;1094;1105
coran	Religion	983;1099
corruption	Varia	973
corse	France	1062
crash test	Varia	885;890;1154
crédit lyonnais	Varia	1102
crise	Varia	849;851;852;853;860;868;872;877;891;896;937;999;1047;1052;1090;1100;1140;1152;1157
crise économique 2009	Varia	868;872;877;891;896;937
crise financière 2008	Varia	849;851;852;853;860
croissance	Varia	870;896;1104;1125;1152;1157
crs	Varia	949;958
darcos	France	910
dati	France	854
de gaulle	France	939;1093
delarue	France	1054
délation	Varia	880
démocratie	Varia	1094
derrick	Varia	861
dieu	Religion	1064;1108
dieudonné	France	1124;1125
dimanche	Varia	1111
discrimination	Violence	966
disparus	Varia	885
drh	Varia	903
drogue	Varia	1014;1054
droite	France	887;926;1093
dsk	France	968;987;989;990;993;994;998;1005;1010;1015;1051;1068;1081;1086;1134;1143;1160
durban	International	879
ebola	Varia	1156
école	Varia	1073;1108;1130;1142
écologie	Varia	801;899;982;1116
économie	Varia	870;872;895;923;934;1104;1110;1125;1140;1141;1152;1157
écoute	Varia	1003
education nationale	Varia	865;883;1142
egypte	International	972;1045;1099
élections	Varia	729;831;846;855;856;885;886;925;932;970;978;980;982;994;1002;1007;1009;1020;1023;1024;1025;1027;1028;1029;1031;1032;1033;1034;1035;1036;1037;1038;1042;1043;1063;1066;1067;1094;1110;1135;1136;1177
elvis	Varia	985
elysée	France	1176
émigration	Varia	1112
enseignement	Varia	865;883
entreprises	Varia	910;1141
environnement	Varia	801;899;929;1116;1117
épidémie	Varia	892;909;1156
essence	Varia	957;1001
été	Varia	890;891;1048;1103;1104;1156;1157
europe	Varia	885;934;953;1013;1017;1039;1065;1112;1144
européennes	France	885;886
eva joly	France	996
évêque	Religion	873
examen	Varia	883
expulsions	Varia	1114
extrême droite	France	921;926;932;965;978;984;997;1028;1029;1031;1036;1043;1066;1070;1095;1109;1112;1135;1147;1172;1174
fabius	France	1087;1089
facebook	Varia	935
famille	Varia	1164
farine animale	Varia	1079
femen	Varia	1053;1081
figaro	Varia	1113
fillon	France	868;926;961;964;996;1014;1040;1053;1063;1066;1067;1070;1105;1169
florange	Varia	1059;1068
florian philippot	France	1174
fmi	Varia	987;989
fn	France	921;926;932;978;979;984;997;1031;1036;1043;1066;1095;1109;1112;1135;1136;1144;1145;1147;1150;1165;1172;1174
foot	Varia	937;938;939;940;986;1115;1148;1149
france	Varia	918;1039;1065;1106;1162
france télécom	Varia	902
france télévision	Varia	864
françois 1er	Religion	1082;1083;1101;1179
françois hollande	France	846;1008;1009;1023;1033;1038;1039;1044;1045;1047;1048;1052;1055;1061;1072;1075;1086;1089;1090;1091;1098;1103;1107;1110;1114;1119;1121;1122;1123;1125;1126;1132;1134;1139;1146;1152;1157;1160;1171;1176
françois mitterrand	France	969;1000
frédéric mitterrand	France	904
frigide barjot	France	1088;1092
galliano	France	977
gauche	France	887;1047;1138
gaza	International	863;865
georges lautner	France	1119
gérard depardieu	France	1069;1070;1133
gigolo	Varia	942
glou glou	Varia	1103;1112
grèce	International	1052
grenelle	France	801
grève	Varia	875;957;1151
grippe	Varia	892;909
guadeloupe	France	871
guéant	France	1014;1025;1029;1040;1053
guerre	Violence	850;894;939;979;985;1001;1075;1106;1107;1155
haïti	International	918
hara-kiri	Varia	1129
hitler	Varia	1013
homophobie	Violence	1012;1046;1064;1073;1074;1088;1092;1093
hortefeux	France	900;948;1040;1053;1105
houellebecq	France	1177
hulot	France	982
humanisme	Varia	991
immigration	Varia	880;906;946;1112
impôts	Varia	853;1024;1056;1069;1086;1106;1113;1115;1116;1117
incendie criminel	Violence	1012
intégrisme	Religion	866;888;917;920;959;1011;1012;1016;1049;1057;1058;1070;1073;1075;1099;1130;1142;1143;1162;1163;1166;1170
intempéries	Varia	1097
internet	Varia	935
irak	International	1170
iron man	Varia	1091
israël	International	863;865;1153
israël-palestine	International	863;865;866;1153
japon	International	978
jean-luc mélenchon	France	1033;1042
jean-marc ayrault	France	1055;1072;1087;1089;1122;1132
jean-pierre pernaut	France	919;960
jean-yves le drian	France	1132
jeanne d'arc	Varia	1145
jérôme cahuzac	France	1086;1087
jesus	Religion	905;966;983;1016;1064;1082;1084;1173
jihad	Terrorisme	1170
jo	Varia	1049;1050;1131
johnny hallyday	France	913;936
journal	Varia	1113;1118
journalisme	Varia	1050
journée de...	Varia	924;1004
juppé	France	1014
justice	Varia	901;945;974;976;987;990;1071;1102
kadhafi	International	975;976;1001
kouchner	France	961
krach	Varia	999
l'oréal	Varia	944;945
la poste	Varia	903
lagarde	France	849;989
laïcité	Religion	981;1108
lcl	Varia	1102
le cimetière	Varia	853;861;889;930;985;1000;1054;1129
le monde	Varia	1003
le pen	France	926;965;1028;1029;1031;1109;1147;1150;1165;1172
législatives	France	1042;1043;1110
les dictatures	International	975
les firmes	Varia	902;944;945;1021;1022;1085;1111
les riches	Varia	1056
les verts	France	996
libération	Varia	1118
libye	International	975;979;1001;1010;1011
livres	Varia	1030;1177
logement	Varia	859
lois	Varia	1046
lourdes	Religion	1097
luc chatel	France	1040
luc ferry	France	991
madonna	France	1042
mahomet	Religion	1163;1178
malaise	Varia	893
mali	International	1074;1075
mam	France	971;973;975
manifestations	Varia	867;874;875;949;951;956;957;975;992;1073;1074;1099;1113;1164
manuel valls	France	1089;1132;1137;1138;1139;1140;1158
mariage	Varia	1046;1074;1076;1088;1092;1093
marine le pen	France	965;970;977;978;979;984;997;1028;1029;1031;1042;1043;1066;1070;1095;1112;1144;1145;1147;1150;1165;1167;1171;1172;1179
maroc	International	971
marseille	France	1107
martine aubry	France	858;898;968;988;995;1008
massacre	Violence	1178
mc donald's	Varia	1168
medef	Varia	875
médicament	Varia	743;916
mer	Varia	890;891;895;944
merkel	International	1039;1065;1110
michael jackson	France	889
michel sapin	France	1132
ministre	France	1137
mireille mathieu	France	1059
miss france	France	964;1120
mitt romney	International	1063
mode	Varia	977
mohamed merah	Terrorisme	1060
montebourg	France	1089;1132;1141;1158
moubarak	International	976
municipales	France	1094;1135;1136
nadine morano	France	1014;1021
najat belkacem	France	1159
nathalie kosciusko-morizet	France	1094
nazisme	Varia	861;1013
négationnisme	Varia	869;879
népotisme	Varia	904
nice	France	1109
nigeria	International	1143
noël	Varia	858;860;914;966;1018;1173
norvège	International	997
nucléaire	Varia	729
nudisme	Varia	1132
obama	International	846;855;866;1107
opus dei	Religion	905
otage	Terrorisme	987
ouverture	Varia	887;897;927
palestine	International	863;865;1153
pape	Religion	847;869;877;928;962;1041;1078;1080;1081;1082;1083;1101;1179
paris	France	1094
parisot	Varia	875
parricide	Violence	1076
patrick buisson	France	1134
patrimoine	Varia	1087
pédophilie	Violence	873;894;928;1073
pétrole	Varia	1001
phallocrate	Varia	1127
pharmaceutique	Varia	916
philosophie	Varia	887;991
pierre moscovici	France	1089
pip	Varia	1021;1022
pirates	Varia	878
pitbulls	Varia	961
plage	Varia	890;891;944;997
pma	Varia	1077
polanski	France	928
police	Varia	884;949;958
pollution	Varia	1116
poséidon adventure	Varia	1022
poutine	International	1053;1131;1133
préservatif	Varia	877;962;994
président	France	1126;1149
présidentielle	France	932;970;980;982;994;1002;1009;1020;1023;1024;1025;1027;1028;1029;1031;1032;1033;1034;1035;1036;1037;1038;1063
présidentielle 2007	France	729
présidentielle 2012	France	932;970;980;982;994;1002;1009;1020;1023;1024;1025;1027;1028;1029;1031;1032;1033;1034;1035;1036;1037;1038
présidentielle 2017	France	1161
presse	Varia	1003;1118;1174
primaires	France	1007;1094
prisons	Varia	854;1053;1175
privatisation	Varia	903
prof	Varia	883
prophète	Religion	905;966;983;1016;1064;1082;1084;1163;1178
prostitution	Varia	1120
provence-alpes-côte d'azur	France	1107;1109
ps	France	831;857;888;968;988;1007;1009;1055;1086;1087;1132;1158
pub	Varia	861;864;982
pussy riot	International	1053
qatar	International	1059
radio	Varia	941
radio france	Varia	941
rama yade	France	908;961
ramadan	Religion	1177
raymond domenech	France	940
recyclage	Varia	1091
réforme	Varia	876
régime	Varia	963
régionales	France	925
régions françaises	France	1116;1117
religion	Religion	847;873;888;917;920;928;962;981;983;1011;1012;1016;1049;1057;1058;1064;1073;1078;1080;1082;1083;1097;1099;1101;1108;1111;1130;1142;1143;1162;1163;1166;1173;1177;1178
remaniement	Varia	954;1132;1158
rentrée	Varia	1002
répression	Varia	975;1026
retraite	Varia	922;930;935;949;950;951;955;1078
révolution	Varia	972;975;1026
rigueur	Varia	1090
rio	International	1101
rolex	France	877
roselyne bachelot	France	909
roumanie	International	950
rumeur	Varia	995;998
russie	International	1131;1133
santé	Varia	892;893;909;911;935;1021;1079;1117;1121
sarkozy	France	743;858;859;862;863;867;868;870;871;872;874;878;881;882;884;893;899;901;904;907;912;914;915;919;921;922;923;925;927;931;933;941;943;944;946;950;953;954;957;964;967;970;974;979;980;1000;1002;1003;1004;1006;1007;1009;1019;1020;1023;1024;1025;1026;1027;1030;1032;1034;1035;1036;1037;1038;1051;1084;1096;1105;1121;1134;1137;1151;1161;1162;1167;1169;1171;1179
science	Varia	917;929
sdf	Varia	859;860;874
sécurité	Varia	946
sécurité sociale	Varia	936
ségolène royal	France	729;831;856;857;858
sénat	France	1006
sénateur	France	1006
sexisme	Violence	924;988;1010;1081;1127;1159
sexualité	Varia	743;1155
siné	Varia	1071
smic	Varia	1148
social	Varia	1077
soeur emmanuelle	Varia	853
sondages	Varia	993;1023;1176
sotchi	Varia	1131
sport	Varia	882;934;937;938;939;940;1049;1050;1098;1115;1131;1148;1149
st valentin	Varia	869
stock-options	Varia	875
suicide	Varia	854;902;903;1086
suisse	International	1087
surpopulation	Varia	1077
syrie	International	1106;1107;1170
taser	Varia	859
taxe	Varia	899;1116;1117
télé	Varia	864;1083;1085;1151;1169;1175
téléphone	Varia	898
téléthon	Varia	911
tempête	Varia	924;925
terrorisme	Terrorisme	955;959;985;1032;1166;1178;1179
tf1	Varia	919;960;1085
théâtre	Varia	1016;1151
torah	Religion	983
torture	Violence	1011
totalitarisme	Varia	975
toulouse	France	1031;1032
trader	Varia	895
train	Varia	1100
travail	Varia	1111
tueur de masse	Violence	1076
tunisie	International	971;972;973;1011
tva	Varia	1024
twitter	Varia	1045
ukraine	International	1133
ump	France	897;925;926;948;958;961;979;1004;1014;1015;1036;1040;1053;1066;1067;1094;1105
université	Varia	883
usa	International	846;851;855;918;987;1063
vacances	Varia	862;890;891;895;972;993;1103;1104
vache folle	Varia	1079
valérie trierweiler	France	1044;1045;1047;1098;1125;1127;1128;1160
vatican	Religion	877;1041;1082
vie privée	Varia	1126
villepin	France	901
violence	Violence	865;883
voeux	Varia	967;1019
voile	Religion	888;1049;1122
voiture	Varia	957
wall street	Varia	851
woerth	France	930;942;943;944;951
xavier darcos	France	865
xénophobie	Violence	900;947;948;949;953;986;996;1026;1114;1124;1125;1145;1165;1167
yannick noah	France	1014
zemmour	France	970;1165;1167
	themes <- function(x) {

	y = rep(NA, length(x))

	# france
	y[ x %in% c("bretagne", "corse", "guadeloupe", "marseille", "nice", "paris",
	"provence-alpes-côte d'azur", "régions françaises",
	"toulouse") ] = "France"

	# politique française
	y[ x %in% c("allègre", "bayrou", "bernard tapie",
	"bernadette chirac", "besson", "bettencourt", "borloo","boutin",
	"cécile duflot", "chirac", "christiane taubira", "clearstream",
	"cohn-bendit", "copé", "darcos", "dati", "de gaulle", "droite",
	"dsk", "elysée", "européennes", "eva joly", "extrême droite",
	"fabius", "fillon", "florian philippot", "fn", "françois hollande",
	"françois mitterrand", "frédéric mitterrand", "gauche", "grenelle",
	"guéant", "hortefeux", "hulot", "jean-luc mélenchon", "jean-marc ayrault",
	"jean-yves le drian", "jérôme cahuzac", "juppé", "kouchner",
	"lagarde", "le pen", "législatives", "les verts", "luc chatel",
	"luc ferry", "mam", "manuel valls", "marine le pen", "martine aubry",
	"michel sapin", "ministre", "montebourg", "municipales",
	"nadine morano", "najat belkacem", "nathalie kosciusko-morizet",
	"patrick buisson", "pierre moscovici", "président", "présidentielle",
	"présidentielle 2007", "présidentielle 2012", "présidentielle 2017",
	"primaires", "ps", "rama yade", "régionales", "rolex",
	"roselyne bachelot", "sarkozy", "ségolène royal", "sénat", "sénateur",
	"ump", "valérie trierweiler", "villepin", "woerth",
	"xavier darcos") ] = "Politique"

	# économie : banques, capitalisme, industrie
	y[ x %in% c("air france", "airbus", "banques", "bernard arnault", "bnp", "bourse",
	"capitalisme", "crédit lyonnais", "crise", "crise économique 2009",
	"crise financière 2008", "entreprises", "florange", "fmi", "krach", "lcl",
	"les firmes", "les riches", "medef", "parisot", "stock-options", "trader",
	"wall street") ] = "Capitalisme"

	# showbiz et assimilés
	y[ x %in% c("anne sinclair", "bigard", "carla bruni", "closer", "delarue",
	"dieudonné", "frigide barjot", "galliano", "gérard depardieu",
	"georges lautner", "houellebecq",
	"jean-pierre pernaut", "johnny hallyday", "madonna", "michael jackson",
	"mireille mathieu", "miss france", "polanski", "raymond domenech", "yannick noah",
	"zemmour") ] = "Showbiz"

	# politique internationale
	y[ x %in% c("afghanistan", "ahmadinejad", "algérie", "allemagne", "bachar al-assad",
	"belgique", "ben ali", "berlusconi", "brésil", "bush", "durban", "egypte",
	"gaza", "grèce", "haïti", "irak", "israël", "israël-palestine", "japon",
	"kadhafi", "les dictatures", "libye", "mali", "maroc", "merkel", "mitt romney",
	"moubarak", "nigeria", "norvège", "obama", "palestine", "poutine", "pussy riot", "qatar",
	"rio", "roumanie", "russie", "suisse", "syrie", "tunisie", "ukraine", "usa" ) ] = "International"

	# terrorisme
	y[ x %in% c("al-qaida", "11 septembre", "ben laden", "jihad", "mohamed merah", "otage",
	"terrorisme") ] = "Terrorisme"

	# religion
	y[ x %in% c("antisémitisme", "athées", "benoit xvi", "bible", "burka", "charia", "coran",
	"dieu", "évêque", "françois 1er", "intégrisme", "jesus", "laïcité", "lourdes",
	"mahomet", "opus dei", "pape", "prophète", "ramadan", "religion", "torah",
	"vatican", "voile") ] = "Religion"

	# armée et police
	y[ x %in% c("14 juillet", "armée", "armes", "crs", "police") ] = "Armée/Police"

	# violence
	y[ x %in% c("assassinat", "attentats", "discrimination", "guerre", "homophobie",
	"incendie criminel", "massacre", "parricide", "pédophilie", "sexisme",
	"torture", "tueur de masse", "violence", "xénophobie") ] = "Violence"

	# médias
	y[ x %in% c("cavanna", "le monde", "libération", "figaro", "france télévision", "hara-kiri",
	"radio", "radio france", "siné", "télé", "tf1") ] = "Medias"

	# residuals
	# 'bac' is the diploma, not the police unit
	# 'bizutage' is not necessarily violent
	y[ x %in% c("2022", "absentéisme", "agriculture", "alcoolisme",
	"alimentation", "amende", "anniversaire", "astérix", "aviation", "avion", "bac",
	"ballerine", "banditisme", "bateau", "batman", "bizutage", "bonne année", "bonus",
	"budget", "ça déraille", "carbone", "caricature", "catastrophes naturelles",
	"charter", "chômage", "cinéma", "climat", "clonage", "clowns",
	"cocaïne", "colonialisme", "coluche", "concept cars", "consommation",
	"corruption", "crash test", "croissance",
	"délation", "démocratie", "derrick", "dimanche", "disparus", "drh", "drogue",
	"ebola", "école", "écologie", "économie",
	"écoute", "education nationale", "élections", "elvis",
	"émigration", "enseignement", "environnement",
	"épidémie", "essence", "été", "europe", "examen", "expulsions",
	"facebook", "famille", "farine animale", "femen",
	"foot", "france", "france télécom",
	"gigolo", "glou glou", "grève", "grippe",
	"hitler", "humanisme", "immigration", "impôts", "intempéries", "internet",
	"iron man", "jeanne d'arc", "jo", "journal", "journalisme", "journée de...",
	"justice", "l'oréal",
	"la poste", "le cimetière", "livres", "logement", "lois",
	"malaise", "manifestations", "mariage",
	"mc donald's", "médicament", "mer", "mode",
	"nazisme", "négationnisme", "népotisme", "noël",
	"nucléaire", "nudisme", "ouverture",
	"patrimoine", "pétrole", "phallocrate", "pharmaceutique",
	"philosophie", "pip", "pirates", "pitbulls",
	"plage", "pma", "pollution", "poséidon adventure",
	"préservatif", "presse",
	"prisons", "privatisation", "prof", "prostitution", "pub",
	"recyclage", "réforme", "régime",
	"remaniement", "rentrée", "répression", "retraite",
	"révolution", "rigueur", "rumeur", "santé", "science",
	"sdf", "sécurité", "sécurité sociale",
	"sexualité", "smic", "social",
	"soeur emmanuelle", "sondages", "sotchi", "sport", "st valentin",
	"suicide", "surpopulation", "taser", "taxe", "téléphone", "téléthon",
	"tempête", "théâtre", "totalitarisme", "train", "travail", "tva", "twitter",
	"université", "vacances", "vache folle",
	"vie privée", "voeux", "voiture") ] = "Varia"

	# simplify
	y[ y %in% c("Showbiz", "Politique") ] = "France"
	y[ y %in% c("Capitalisme", "Armée/Police", "Medias") ] = "Varia"

	return(y)

	}