# scrape author metadata from 'sociology' journals on cairn.info – start with revues.r
library(dplyr)
library(ggplot2)
library(rvest)    # html, html_nodes, html_attr, html_text
library(scales)   # percent_format
library(stringr)  # str_trim, str_extract

data = "revues-soc.csv"
if(!file.exists(data)) {

  dir.create("rev", showWarnings = FALSE)
  dir.create("num", showWarnings = FALSE)

  r = "http://www.cairn.info/"
  download.file(paste0(r, "discipline.php?POS=11&TITRE=ALL"),
                "revues-soc.html", mode = "wb", quiet = TRUE)

  y = html("revues-soc.html")
  y = html_nodes(y, ".revue a") %>% html_attr("href") %>% unique

  for(i in y) {

    u = paste0(r, i)
    f = paste0("rev/", i)

    # first page
    cat("Parsing:", f, "\n")
    if(!file.exists(f))
      download.file(u, f, mode = "wb", quiet = TRUE)

    h = html(f)

    # possible additional pages, numbered from 2 onwards
    p = html_nodes(h, ".pager .nb a") %>% html_attr("href")
    for(j in p) {

      u = paste0(r, j)
      f = gsub("\\.htm$", paste0("-", which(p == j) + 1, ".htm"), i)
      f = paste0("rev/", f)

      cat("Parsing:", f, "\n")
      if(!file.exists(f))
        download.file(u, f, mode = "wb", quiet = TRUE)

    }

  }
  d = data.frame()
  for(i in dir("rev", pattern = "^revue", full.names = TRUE)) {

    h = html(i)

    # issues
    n = h %>%
      html_nodes(".list_numeros a") %>%
      html_attr("href") %>%
      unique

    # escape spaces in issue URLs
    n = gsub("\\s", "%20", n)

    cat("\n", i, ":", length(n), "issue(s)\n")
    for(j in rev(n)) {

      cat(sprintf("%3.0f", which(n == j)), j)
      u = paste0(r, j)
      f = paste0("num/", j)

      # some issues are missing
      if(!file.exists(f))
        h = try(download.file(u, f, mode = "wb", quiet = TRUE), silent = TRUE)

      if("try-error" %in% class(h) || !file.exists(f) || !file.info(f)$size) {

        cat(": failed\n")

      } else {

        h = html(f)

        # unique authors
        a = h %>%
          html_nodes(".list_articles .authors a") %>%
          html_attr("href") %>%
          unique

        # simplify URLs
        a = gsub("publications-de-|\\.htm$", "", a[ a != "" ])

        # unique articles
        l = str_trim(h %>% html_nodes(".list_articles .authors") %>% html_text())
        l = length(l[ l != "" ])

        cat(":", l, "article(s)", length(a), "author(s)\n")

        # add to dataset
        if(length(a))
          d = rbind(d, data.frame(numero = f,
                                  revue = gsub("num/revue-|-\\d{4}-(.*)\\.htm", "", f),
                                  annee = str_extract(f, "[0-9]{4}"),
                                  auteur = a,
                                  articles = l,
                                  stringsAsFactors = FALSE))

      }

    }

  }

  write.csv(d, data, row.names = FALSE)

}
d = read.csv(data, stringsAsFactors = FALSE)
table(d$annee)
# total number of articles per issue (renamed to avoid masking base::sum)
tot = group_by(d, numero) %>% summarise(n_articles = unique(articles))
cat(n_distinct(d$revue), "journals",
    sum(tot$n_articles), "articles",
    n_distinct(d$numero), "issues",
    n_distinct(d$auteur), "authors\n")
# number of articles per author
n_a = group_by(d, auteur) %>%
  summarise(n_articles = n())
# author concentration index
h_a = d[, c("auteur", "revue") ] %>%
  group_by(auteur, revue) %>%
  # number of articles in each journal
  summarise(n = n()) %>%
  group_by(auteur) %>%
  # journal shares by author
  mutate(s = n / sum(n)) %>%
  # Herfindahl–Hirschman Index: sum of squared journal shares, by author
  summarise(hhi = sum(s^2), n_revues = n())
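
# a minimal sanity check of the HHI pipeline above, on hypothetical data: an
# author with 3 articles in journal "A" and 1 in "B" has shares .75 and .25,
# so hhi = .75^2 + .25^2 = .625; a single-journal author scores exactly 1
toy = data.frame(auteur = "x", revue = c("A", "A", "A", "B"))
group_by(toy, auteur, revue) %>%
  summarise(n = n()) %>%
  group_by(auteur) %>%
  mutate(s = n / sum(n)) %>%
  summarise(hhi = sum(s^2)) # 0.625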
# join both author-level tables
a = full_join(n_a, h_a) %>%
  arrange(hhi)
# HHI will vary between 1/14 and 1: an author publishing evenly across k
# journals scores k * (1/k)^2 = 1/k, and no author here spans more than 14
summary(a$n_revues)
# example authors with tons of articles in just one journal
filter(a, hhi == 1 & n_articles > 50)
# concentration decreases with number of journals per author
qplot(data = a, y = hhi, x = factor(n_revues),
      geom = "boxplot") +
  labs(y = "Indice de Herfindahl-Hirschman\n",
       x = "\nNombre de revues par auteur") +
  theme_bw()
ggsave("revues_hhi.png", width = 10, height = 9)
# percentage of single-journal authors + unweighted and weighted journal HHI
# weighted HHI weights each author by his/her share of the journal's articles
h = full_join(d[, c("revue", "auteur") ] %>%
                group_by(revue, auteur) %>%
                mutate(weight = n()) %>%
                unique %>%
                group_by(revue) %>%
                mutate(weight = weight / sum(weight)),
              a[, c("auteur", "hhi", "n_revues") ]) %>%
  group_by(revue) %>%
  summarise(
    n_auts = n(),
    p_mono = sum(n_revues == 1) / n(),
    wt_hhi = weighted.mean(hhi, weight),
    mu_hhi = mean(hhi),
    sd_hhi = sd(hhi),
    min_hhi = min(hhi),
    max_hhi = max(hhi)) %>%
  arrange(mu_hhi)
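
# quick check of what the weighting does, on hypothetical numbers: a journal
# with one author at hhi = 1 (3 articles) and one at hhi = .5 (1 article)
# gives the prolific author 3/4 of the weight
weighted.mean(c(1, .5), c(3, 1) / 4) # 0.875, against an unweighted mean of 0.75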
# weighting produces almost no correction
cor(h$mu_hhi, h$wt_hhi)
# ordered HHI pointranges, separated by quartiles
h$nq = cut(h$mu_hhi, quantile(h$mu_hhi), include.lowest = TRUE, dig.lab = 2)
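
# toy check of the quartile binning used just above: cut() on quantile()
# breaks splits a variable into four bins of (near-)equal size
table(cut(1:8, quantile(1:8), include.lowest = TRUE)) # two observations per bin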
qplot(data = h, x = reorder(revue, -mu_hhi),
      y = mu_hhi, ymin = mu_hhi - sd_hhi, ymax = mu_hhi + sd_hhi,
      color = nq,
      geom = "pointrange") +
  # geom_point(aes(y = wt_hhi), color = "grey") +
  coord_flip() +
  scale_color_brewer("Quartile", palette = "RdBu") +
  labs(x = NULL, y = "\nIndice de Herfindahl-Hirschman moyen ± 1 écart-type") +
  theme_bw() +
  theme(legend.position = "bottom")
ggsave("revues_hhi_rank.png", width = 10, height = 16)
ggsave("revues_hhi_rank.pdf", width = 10, height = 16)
# correlation between HHI and mono-journal author percentage
with(h, cor(p_mono, wt_hhi))
with(h, cor(p_mono, mu_hhi))
# ordered mono-journal author percentage, separated by quartiles
h$nq = cut(100 * h$p_mono, quantile(100 * h$p_mono), include.lowest = TRUE, dig.lab = 2)
qplot(data = h, x = reorder(revue, -p_mono),
      y = p_mono, color = nq) +
  coord_flip() +
  scale_color_brewer("Quartile", palette = "RdBu") +
  scale_y_continuous(labels = percent_format()) +
  labs(x = NULL, y = "\nPourcentage d'auteurs ayant publié dans cette seule revue") +
  theme_bw() +
  theme(legend.position = "bottom")
ggsave("revues_mono_rank.png", width = 10, height = 16)
ggsave("revues_mono_rank.pdf", width = 10, height = 16)
# 'blind' rate of editorial concentration, by journal: the share of all
# publications by a journal's authors that appeared in that same journal
e = data.frame()
for(i in unique(d$revue)) {
  t = unique(d[ d$revue == i, "auteur" ])
  t = as.data.frame(table(d[ d$auteur %in% t, "revue" ]))
  t = 1 - (sum(t$Freq) - t$Freq[ t$Var1 == i ]) / sum(t$Freq)
  e = rbind(e, data.frame(revue = i, t, stringsAsFactors = FALSE))
}
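
# worked example of the rate above, on hypothetical counts: if the authors of
# journal i total 40 publications, 30 of them in i itself, then
1 - (40 - 30) / 40 # 0.75, i.e. three quarters of their output stays in i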
e = full_join(h, arrange(e, -t))
# correlation to HHI and mono-journal author percentage
with(e, cor(p_mono, t))
with(e, cor(mu_hhi, t))
# ordered editorial concentration rate, separated by quartiles
e$nq = cut(100 * e$t, quantile(100 * e$t), include.lowest = TRUE, dig.lab = 2)
qplot(data = e, x = reorder(revue, -t),
      y = t, color = nq) +
  coord_flip() +
  scale_color_brewer("Quartile", palette = "RdBu") +
  scale_y_continuous(labels = percent_format()) +
  labs(x = NULL, y = "\nTaux de concentration de la production écrite des auteurs") +
  theme_bw() +
  theme(legend.position = "bottom")
ggsave("revues_conc_rank.png", width = 10, height = 16)
ggsave("revues_conc_rank.pdf", width = 10, height = 16)
library(animation)
library(network)
library(GGally)
library(sna) # unweighted degree
library(tnet) # weighted degree
dir.create("plots", showWarnings = FALSE)
dir.create("csv", showWarnings = FALSE)
#
# shared authors between journals, weighted by articles
# (run revues-data.r first)
#
k = paste0("revue-", unique(d$revue))
for(j in k) {

  n = dir("num", pattern = paste0("^", j, "-\\d{4}"), full.names = TRUE)
  n = n[ file.info(n)$size > 0 ]

  f = paste0("csv/", j, ".csv")
  if(!file.exists(f)) {

    cat("\n", j, ":", length(n), "issue(s)\n")
    r = data.frame()

    for(i in rev(n)) {

      cat(sprintf("%3.0f", which(n == i)), i)
      h = html(i)

      # unique authors per article, collapsed into ";"-separated strings
      a = html_nodes(h, ".list_articles .authors")
      a = sapply(a, function(x) html_nodes(x, "a") %>%
                   html_attr("href") %>%
                   unique %>%
                   paste0(., collapse = ";"))

      # simplify URLs
      a = gsub("publications-de-|\\.htm", "", a[ a != "" ])

      cat(":", length(a), "article(s)\n")

      # add to dataset
      if(length(a))
        r = rbind(r, data.frame(numero = i,
                                revue = gsub("num/revue-|-\\d{4}-(.*)\\.htm", "", i),
                                annee = str_extract(i, "[0-9]{4}"),
                                auteurs = a,
                                stringsAsFactors = FALSE))

    }

    write.csv(r, f, row.names = FALSE)

  }

}
r = dir("csv", pattern = "^revue-", full.names = TRUE)
r = lapply(r, read.csv, stringsAsFactors = FALSE)
r = bind_rows(r)
#
# journal-specific edge lists
#
a = strsplit(r$auteurs, ";")
e = data.frame()
for(i in unique(r$revue)) {
  # authors who published in journal i
  y = unique(unlist(strsplit(r$auteurs[ r$revue == i ], ";")))
  cat(i, ":", length(y), "authors ")
  # all articles written by these authors, in any journal
  y = r[ which(sapply(a, function(x) any(x %in% y))), ]
  cat(nrow(y), "articles", n_distinct(y$revue), "journals\n")
  # edge weight: article counts, normalised by the count in journal i itself
  y = as.data.frame(table(y$revue))
  y$Freq = y$Freq / y$Freq[ y$Var1 == i ]
  y = y[ y$Var1 != i, ]
  e = rbind(e, data.frame(i, j = y$Var1, w = y$Freq, stringsAsFactors = FALSE))
}
#
# weighted edge list
#
n = apply(e[, 1:2 ], 1, function(x) paste0(sort(x), collapse = "///"))
n = data.frame(n, w = e$w)
e = aggregate(w ~ n, sum, data = n)
e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$n),
               j = gsub("(.*)///(.*)", "\\2", e$n),
               w = e$w)
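
# sketch of the sort-and-paste trick above: both directed pairs map to the
# same key, so aggregate() sums their weights into one undirected edge
paste0(sort(c("b", "a")), collapse = "///") # "a///b", same as for c("a", "b")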
e = mutate(e, ecdf = cume_dist(w))
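
# cume_dist() is the empirical CDF: the share of edge weights at or below w
cume_dist(c(5, 1, 3)) # 1, 1/3, 2/3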
qplot(data = e, x = w, y = ecdf, color = I("grey")) +
  scale_x_log10() +
  labs(y = "Fréquence cumulée\n", x = "\nIntensité des liens (logarithme base 10)") +
  theme_bw()
ggsave("ecdf.png", width = 9, height = 9)
e = filter(e, w < 1) %>% mutate(ecdf = cume_dist(w))
plot_ecdf = function(x) {
  g = qplot(data = subset(e, i != x & j != x),
            x = w, y = ecdf, color = I("grey")) +
    geom_point(data = subset(e, (i == x | j == x)), color = "black") +
    geom_rug(data = subset(e, (i == x | j == x)), sides = "b") +
    scale_color_manual(values = c("TRUE" = "black", "FALSE" = "grey")) +
    scale_x_log10() +
    labs(y = "Fréquence cumulée\n", x = "\nIntensité des liens (logarithme base 10)") +
    guides(color = FALSE) +
    theme_bw()
  print(g)
}
plot_ecdf("geneses")
ggsave("ecdf_geneses.png", width = 9, height = 9)
plot_ecdf("societes")
plot_ecdf("sociologie")
#
# animated networks
#
plot_network <- function(x) {
  saveGIF({
    for(q in c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9,
               .91, .92, .93, .94, .95, .96, .97, .98, .99)) {
      # keep only the edges above the q-th weight quantile
      t = quantile(e$w, q)
      n = network(e[ e$w > t, 1:2 ], directed = FALSE)
      set.edge.attribute(n, "weight", e[ e$w > t, 3 ])
      # node size from degree quartiles; flag the journal of interest
      n %v% "size" = 1 + as.numeric(cut(degree(n), unique(quantile(degree(n))), include.lowest = TRUE))
      n %v% "id" = ifelse(network.vertex.names(n) == x, "S", "A")
      colors = c("A" = "black", "S" = "red")
      colors = colors[ names(colors) %in% unique(n %v% "id") ]
      # edge transparency from weight quartiles
      n %e% "alpha" = as.numeric(cut(n %e% "weight", unique(quantile(n %e% "weight")), include.lowest = TRUE))
      n %e% "alpha" = n %e% "alpha" / max(n %e% "alpha")
      g = ggnet(n, size = 3,
                node.group = n %v% "id", node.color = colors,
                segment.alpha = n %e% "alpha", segment.color = "grey33",
                label.size = 6) + # label.size = n %v% "size",
        ggtitle(paste("seuil =", q, "\n", network.size(n), "revues")) +
        guides(color = FALSE)
      print(g)
    }
  }, movie.name = paste0("network_", x, ".gif"))
}
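
# how the threshold works, on toy weights: at q = .9, quantile() returns the
# 90th percentile, so only the top decile of edges survives the e$w > t filter
quantile(1:10, .9) # 9.1 (type-7 interpolation), keeping only the heaviest edge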
plot_network("societes")
plot_network("sociologie")
#
# relative centrality at each threshold level
#
rel_degree <- function(x) {
  wd = data.frame()
  for(q in seq(0, .99, .01)) {
    # keep only the edges above the q-th weight quantile
    t = quantile(e$w, q)
    t = e[ e$w > t, 1:3 ]
    t$i = as.character(t$i)
    t$j = as.character(t$j)
    # tnet expects integer node ids
    l = unique(c(t$i, t$j))
    t$i = as.numeric(factor(t$i, levels = l))
    t$j = as.numeric(factor(t$j, levels = l))
    t = symmetrise_w(as.tnet(t, type = "weighted one-mode tnet"))
    # degree of journal x, relative to the maximum degree in the network
    y = which(l == x)
    if(!length(y))
      wd = rbind(wd, data.frame(x, q, reldegree = 0, stringsAsFactors = FALSE))
    else
      wd = rbind(wd, data.frame(x, q,
                                reldegree = degree_w(t)[ y, 2 ] / max(degree_w(t)[, 2]),
                                stringsAsFactors = FALSE))
  }
  wd
}
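
# usage sketch: rel_degree() returns one row per threshold in seq(0, .99, .01);
# the most central journal at a given threshold scores 1, and a journal that
# drops out of the network above the threshold scores 0, e.g.
# head(rel_degree("societes"))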
wd = data.frame()
j = unique(r$revue)
for(i in rev(j)) {
  cat(sprintf("%3.0f", which(j == i)), i, "\n")
  wd = rbind(wd, rel_degree(i))
}
qplot(data = wd, y = reldegree, x = q, group = x,
      alpha = I(.5), color = I("grey"), geom = "line") +
  geom_line(data = subset(wd, x %in% c("recherche-en-soins-infirmiers",
                                       "actes-de-la-recherche-en-sciences-sociales",
                                       "societes",
                                       "sociologie",
                                       "francaise-de-sociologie",
                                       "reseaux",
                                       "geneses")),
            aes(color = x), alpha = 1, size = 1) +
  scale_color_brewer("", palette = "Set1") +
  labs(y = "Degré de centralité relatif\n", x = "\nSeuil d'intensité des liens entre les revues") +
  theme_bw() +
  theme(panel.grid = element_blank())
ggsave("degree_all.png", width = 11, height = 8)
#
# load packages
#
library(dplyr)
library(ggplot2)
library(Hmisc) # wtd.mean and wtd.quantile
library(httr)
library(rvest)
library(scales)
library(stringr)
#
# load master data
#
source("revues-data.r")
#
# summary stats
#
# by journal:
# number of issues and unique authors
by_j = group_by(d, revue) %>%
  summarise(min = min(annee), max = max(annee),
            numeros = n_distinct(numero),
            auteurs = n_distinct(auteur)) %>%
  mutate(annees = as.numeric(max) - as.numeric(min) + 1) %>%
  arrange(-annees) #%>%
  # head(., 5)
  # tail(., 5)
  # journals comparable to 'Sociétés':
  # filter(annees %in% 12:16 & auteurs %in% 443:463)
# by journal issue:
# average number of authors and articles
by_i = group_by(d, numero) %>%
  mutate(auteurs = n_distinct(auteur)) %>%
  select(numero, revue, articles, auteurs) %>%
  unique %>%
  group_by(revue) %>%
  summarise(mu_articles = mean(articles), mu_auteurs = mean(auteurs))
#
# journals per author
#
dd = d %>%
  group_by(auteur) %>%
  mutate(n = n_distinct(revue)) %>%
  group_by(revue) %>%
  summarise(mu = mean(n), sd = sd(n), n = n()) %>%
  arrange(-mu) %>%
  mutate(rank = row_number(), rankp = percent_rank(mu))
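
# percent_rank() rescales ranks to [0, 1] (toy check):
percent_rank(c(2, 4, 4, 8)) # 0.00 0.33 0.33 1.00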
# some example journals
dd$tagged = NA
dd$tagged[ dd$revue == "societes" ] = "Sociétés"
dd$tagged[ dd$revue == "sociologie" ] = "Sociologie"
dd$tagged[ dd$revue == "droit-et-societe" ] = "Droit et Société"
dd$tagged[ dd$revue == "reseaux" ] = "Réseaux"
dd$tagged[ dd$revue == "population" ] = "Population"
qplot(data = dd, y = rankp, x = mu, geom = "step") +
  geom_point(data = subset(dd, !is.na(tagged))) +
  geom_segment(data = subset(dd, !is.na(tagged)),
               aes(xend = mu + .125, yend = rankp), lty = "dotted") +
  geom_text(data = subset(dd, !is.na(tagged)),
            aes(x = mu + .15, label = tagged), hjust = 0, fontface = "italic") +
  xlim(1, 4.25) +
  scale_y_continuous(labels = percent_format()) +
  labs(x = "\nNombre moyen de revues par auteur", y = "Fréquence cumulée\n") +
  theme_bw()
ggsave("revues_ecdf.png", width = 10, height = 9)
# more comparable journals
dd$tagged = NA
dd$tagged[ dd$revue == "societes" ] = "Sociétés"
dd$tagged[ dd$revue == "geneses" ] = "Genèses"
dd$tagged[ dd$revue == "cahiers-d-etudes-africaines" ] = "Cahiers d'études africaines"
qplot(data = dd, y = rankp, x = mu, geom = "step") +
  geom_point(data = subset(dd, !is.na(tagged))) +
  geom_segment(data = subset(dd, !is.na(tagged)),
               aes(xend = mu + .125, yend = rankp), lty = "dotted") +
  geom_text(data = subset(dd, !is.na(tagged)),
            aes(x = mu + .15, label = tagged), hjust = 0, fontface = "italic") +
  xlim(1, 4.25) +
  scale_y_continuous(labels = percent_format()) +
  labs(x = "\nNombre moyen de revues par auteur", y = "Fréquence cumulée\n") +
  theme_bw()
ggsave("revues_comparables.png", width = 10, height = 9)
# sd increases with mean
qplot(data = dd, y = mu, x = sd) +
  labs(y = "Nombre moyen de revues par auteur\n", x = "\nÉcart-type") +
  geom_smooth(method = "loess", se = FALSE) +
  theme_bw()
ggsave("revues_mu_sd.png", width = 10, height = 9)
#
# 'Sociétés' journal
#
# other journals with same authors
t = unique(d[ d$revue == "societes", "auteur" ])
t = as.data.frame(table(d[ d$auteur %in% t, "revue" ]))
t = arrange(t, -Freq)
filter(t, Freq > 9)
# share of these authors' publications that appeared in other journals
(sum(t$Freq) - t$Freq[1]) / sum(t$Freq)