# scrape author metadata from 'sociology' journals on cairn.info – start with revues.r
library(dplyr)
library(ggplot2)
library(rvest)    # html, html_nodes, html_attr, html_text
library(scales)   # percent_format
library(stringr)  # str_trim, str_extract

data = "revues-soc.csv"
if(!file.exists(data)) {

  dir.create("rev", showWarnings = FALSE)
  dir.create("num", showWarnings = FALSE)

  r = "http://www.cairn.info/"
  download.file(paste0(r, "discipline.php?POS=11&TITRE=ALL"),
                "revues-soc.html", mode = "wb", quiet = TRUE)

  y = html("revues-soc.html")
  y = html_nodes(y, ".revue a") %>% html_attr("href") %>% unique

  for(i in y) {

    u = paste0(r, i)
    f = paste0("rev/", i)

    # first page
    cat("Parsing:", f, "\n")
    if(!file.exists(f))
      download.file(u, f, mode = "wb", quiet = TRUE)

    h = html(f)

    # possible additional pages, numbered from 2 onwards
    p = html_nodes(h, ".pager .nb a") %>% html_attr("href")
    for(j in p) {

      u = paste0(r, j)
      f = gsub("\\.htm$", paste0("-", which(p == j) + 1, ".htm"), i)
      f = paste0("rev/", f)

      cat("Parsing:", f, "\n")
      if(!file.exists(f))
        download.file(u, f, mode = "wb", quiet = TRUE)

    }

  }
  d = data.frame()
  for(i in dir("rev", pattern = "^revue", full.names = TRUE)) {

    h = html(i)

    # issues
    n = h %>%
      html_nodes(".list_numeros a") %>%
      html_attr("href") %>%
      unique

    # escape spaces in issue URLs
    n = gsub("\\s", "%20", n)

    cat("\n", i, ":", length(n), "issue(s)\n")
    for(j in rev(n)) {

      cat(sprintf("%3.0f", which(n == j)), j)
      u = paste0(r, j)
      f = paste0("num/", j)

      # some issues are missing
      if(!file.exists(f))
        h = try(download.file(u, f, mode = "wb", quiet = TRUE), silent = TRUE)

      if("try-error" %in% class(h) || !file.exists(f) || !file.info(f)$size) {

        cat(": failed\n")

      } else {

        h = html(f)

        # unique authors
        a = h %>%
          html_nodes(".list_articles .authors a") %>%
          html_attr("href") %>%
          unique

        # simplify URLs
        a = gsub("publications-de-|\\.htm$", "", a[ a != "" ])

        # unique articles
        l = str_trim(h %>% html_nodes(".list_articles .authors") %>% html_text())
        l = length(l[ l != "" ])

        cat(":", l, "article(s)", length(a), "author(s)\n")

        # add to dataset
        if(length(a))
          d = rbind(d, data.frame(numero = f,
                                  revue = gsub("num/revue-|-\\d{4}-(.*)\\.htm", "", f),
                                  annee = str_extract(f, "[0-9]{4}"),
                                  auteur = a,
                                  articles = l,
                                  stringsAsFactors = FALSE))

      }

    }

  }

  write.csv(d, data, row.names = FALSE)

}
d = read.csv(data, stringsAsFactors = FALSE)
table(d$annee)
# total number of articles per issue (renamed to avoid masking base::sum)
tot = group_by(d, numero) %>% summarise(n_articles = unique(articles))
cat(n_distinct(d$revue), "journals",
    sum(tot$n_articles), "articles",
    n_distinct(d$numero), "issues",
    n_distinct(d$auteur), "authors\n")
# number of articles per author
n_a = group_by(d, auteur) %>%
  summarise(n_articles = n())
# author concentration index
h_a = d[, c("auteur", "revue") ] %>%
  group_by(auteur, revue) %>%
  # number of articles in each journal
  summarise(n = n()) %>%
  group_by(auteur) %>%
  # journal shares by author
  mutate(s = n / sum(n)) %>%
  # Herfindahl–Hirschman Index: sum of squared journal shares, by author
  summarise(hhi = sum(s^2), n_revues = n())
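
# a minimal sanity check of the HHI pipeline above, on hypothetical data: an
# author with 3 articles in journal "A" and 1 in "B" has shares .75 and .25,
# so hhi = .75^2 + .25^2 = .625; a single-journal author scores exactly 1
toy = data.frame(auteur = "x", revue = c("A", "A", "A", "B"))
group_by(toy, auteur, revue) %>%
  summarise(n = n()) %>%
  group_by(auteur) %>%
  mutate(s = n / sum(n)) %>%
  summarise(hhi = sum(s^2)) # 0.625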
# join both author-level tables
a = full_join(n_a, h_a) %>%
  arrange(hhi)
# HHI will vary between 1/14 and 1: an author publishing evenly across k
# journals scores k * (1/k)^2 = 1/k, and no author here spans more than 14
summary(a$n_revues)
# example authors with tons of articles in just one journal
filter(a, hhi == 1 & n_articles > 50)
# concentration decreases with number of journals per author
qplot(data = a, y = hhi, x = factor(n_revues),
      geom = "boxplot") +
  labs(y = "Indice de Herfindahl-Hirschman\n",
       x = "\nNombre de revues par auteur") +
  theme_bw()
ggsave("revues_hhi.png", width = 10, height = 9)
# percentage of single-journal authors + unweighted and weighted journal HHI
# weighted HHI weights each author by his/her share of the journal's articles
h = full_join(d[, c("revue", "auteur") ] %>%
                group_by(revue, auteur) %>%
                mutate(weight = n()) %>%
                unique %>%
                group_by(revue) %>%
                mutate(weight = weight / sum(weight)),
              a[, c("auteur", "hhi", "n_revues") ]) %>%
  group_by(revue) %>%
  summarise(
    n_auts = n(),
    p_mono = sum(n_revues == 1) / n(),
    wt_hhi = weighted.mean(hhi, weight),
    mu_hhi = mean(hhi),
    sd_hhi = sd(hhi),
    min_hhi = min(hhi),
    max_hhi = max(hhi)) %>%
  arrange(mu_hhi)
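
# quick check of what the weighting does, on hypothetical numbers: a journal
# with one author at hhi = 1 (3 articles) and one at hhi = .5 (1 article)
# gives the prolific author 3/4 of the weight
weighted.mean(c(1, .5), c(3, 1) / 4) # 0.875, against an unweighted mean of 0.75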
# weighting produces almost no correction
cor(h$mu_hhi, h$wt_hhi)
# ordered HHI pointranges, separated by quartiles
h$nq = cut(h$mu_hhi, quantile(h$mu_hhi), include.lowest = TRUE, dig.lab = 2)
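
# toy check of the quartile binning used just above: cut() on quantile()
# breaks splits a variable into four bins of (near-)equal size
table(cut(1:8, quantile(1:8), include.lowest = TRUE)) # two observations per bin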
qplot(data = h, x = reorder(revue, -mu_hhi),
      y = mu_hhi, ymin = mu_hhi - sd_hhi, ymax = mu_hhi + sd_hhi,
      color = nq,
      geom = "pointrange") +
  # geom_point(aes(y = wt_hhi), color = "grey") +
  coord_flip() +
  scale_color_brewer("Quartile", palette = "RdBu") +
  labs(x = NULL, y = "\nIndice de Herfindahl-Hirschman moyen ± 1 écart-type") +
  theme_bw() +
  theme(legend.position = "bottom")
ggsave("revues_hhi_rank.png", width = 10, height = 16)
ggsave("revues_hhi_rank.pdf", width = 10, height = 16)
# correlation between HHI and mono-journal author percentage
with(h, cor(p_mono, wt_hhi))
with(h, cor(p_mono, mu_hhi))
# ordered mono-journal author percentage, separated by quartiles
h$nq = cut(100 * h$p_mono, quantile(100 * h$p_mono), include.lowest = TRUE, dig.lab = 2)
qplot(data = h, x = reorder(revue, -p_mono),
      y = p_mono, color = nq) +
  coord_flip() +
  scale_color_brewer("Quartile", palette = "RdBu") +
  scale_y_continuous(labels = percent_format()) +
  labs(x = NULL, y = "\nPourcentage d'auteurs ayant publié dans cette seule revue") +
  theme_bw() +
  theme(legend.position = "bottom")
ggsave("revues_mono_rank.png", width = 10, height = 16)
ggsave("revues_mono_rank.pdf", width = 10, height = 16)
# 'blind' rate of editorial concentration, by journal: the share of all
# publications by a journal's authors that appeared in that same journal
e = data.frame()
for(i in unique(d$revue)) {
  t = unique(d[ d$revue == i, "auteur" ])
  t = as.data.frame(table(d[ d$auteur %in% t, "revue" ]))
  t = 1 - (sum(t$Freq) - t$Freq[ t$Var1 == i ]) / sum(t$Freq)
  e = rbind(e, data.frame(revue = i, t, stringsAsFactors = FALSE))
}
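
# worked example of the rate above, on hypothetical counts: if the authors of
# journal i total 40 publications, 30 of them in i itself, then
1 - (40 - 30) / 40 # 0.75, i.e. three quarters of their output stays in i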
e = full_join(h, arrange(e, -t))
# correlation to HHI and mono-journal author percentage
with(e, cor(p_mono, t))
with(e, cor(mu_hhi, t))
# ordered editorial concentration rate, separated by quartiles
e$nq = cut(100 * e$t, quantile(100 * e$t), include.lowest = TRUE, dig.lab = 2)
qplot(data = e, x = reorder(revue, -t),
      y = t, color = nq) +
  coord_flip() +
  scale_color_brewer("Quartile", palette = "RdBu") +
  scale_y_continuous(labels = percent_format()) +
  labs(x = NULL, y = "\nTaux de concentration de la production écrite des auteurs") +
  theme_bw() +
  theme(legend.position = "bottom")
ggsave("revues_conc_rank.png", width = 10, height = 16)
ggsave("revues_conc_rank.pdf", width = 10, height = 16)
library(animation)
library(network)
library(GGally)
library(sna) # unweighted degree
library(tnet) # weighted degree
dir.create("plots", showWarnings = FALSE)
dir.create("csv", showWarnings = FALSE)
#
# shared authors between journals, weighted by articles
# (run revues-data.r first)
#
k = paste0("revue-", unique(d$revue))
for(j in k) {

  n = dir("num", pattern = paste0("^", j, "-\\d{4}"), full.names = TRUE)
  n = n[ file.info(n)$size > 0 ]

  f = paste0("csv/", j, ".csv")
  if(!file.exists(f)) {

    cat("\n", j, ":", length(n), "issue(s)\n")
    r = data.frame()

    for(i in rev(n)) {

      cat(sprintf("%3.0f", which(n == i)), i)
      h = html(i)

      # unique authors per article, collapsed into ";"-separated strings
      a = html_nodes(h, ".list_articles .authors")
      a = sapply(a, function(x) html_nodes(x, "a") %>%
                   html_attr("href") %>%
                   unique %>%
                   paste0(., collapse = ";"))

      # simplify URLs
      a = gsub("publications-de-|\\.htm", "", a[ a != "" ])

      cat(":", length(a), "article(s)\n")

      # add to dataset
      if(length(a))
        r = rbind(r, data.frame(numero = i,
                                revue = gsub("num/revue-|-\\d{4}-(.*)\\.htm", "", i),
                                annee = str_extract(i, "[0-9]{4}"),
                                auteurs = a,
                                stringsAsFactors = FALSE))

    }

    write.csv(r, f, row.names = FALSE)

  }

}
r = dir("csv", pattern = "^revue-", full.names = TRUE)
r = lapply(r, read.csv, stringsAsFactors = FALSE)
r = bind_rows(r)
#
# journal-specific edge lists
#
a = strsplit(r$auteurs, ";")
e = data.frame()
for(i in unique(r$revue)) {
  # authors who published in journal i
  y = unique(unlist(strsplit(r$auteurs[ r$revue == i ], ";")))
  cat(i, ":", length(y), "authors ")
  # all articles written by these authors, in any journal
  y = r[ which(sapply(a, function(x) any(x %in% y))), ]
  cat(nrow(y), "articles", n_distinct(y$revue), "journals\n")
  # edge weight: article counts, normalised by the count in journal i itself
  y = as.data.frame(table(y$revue))
  y$Freq = y$Freq / y$Freq[ y$Var1 == i ]
  y = y[ y$Var1 != i, ]
  e = rbind(e, data.frame(i, j = y$Var1, w = y$Freq, stringsAsFactors = FALSE))
}
#
# weighted edge list
#
n = apply(e[, 1:2 ], 1, function(x) paste0(sort(x), collapse = "///"))
n = data.frame(n, w = e$w)
e = aggregate(w ~ n, sum, data = n)
e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$n),
               j = gsub("(.*)///(.*)", "\\2", e$n),
               w = e$w)
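
# sketch of the sort-and-paste trick above: both directed pairs map to the
# same key, so aggregate() sums their weights into one undirected edge
paste0(sort(c("b", "a")), collapse = "///") # "a///b", same as for c("a", "b")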
e = mutate(e, ecdf = cume_dist(w))
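
# cume_dist() is the empirical CDF: the share of edge weights at or below w
cume_dist(c(5, 1, 3)) # 1, 1/3, 2/3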
qplot(data = e, x = w, y = ecdf, color = I("grey")) +
  scale_x_log10() +
  labs(y = "Fréquence cumulée\n", x = "\nIntensité des liens (logarithme base 10)") +
  theme_bw()
ggsave("ecdf.png", width = 9, height = 9)
e = filter(e, w < 1) %>% mutate(ecdf = cume_dist(w))
plot_ecdf = function(x) {
  g = qplot(data = subset(e, i != x & j != x),
            x = w, y = ecdf, color = I("grey")) +
    geom_point(data = subset(e, (i == x | j == x)), color = "black") +
    geom_rug(data = subset(e, (i == x | j == x)), sides = "b") +
    scale_color_manual(values = c("TRUE" = "black", "FALSE" = "grey")) +
    scale_x_log10() +
    labs(y = "Fréquence cumulée\n", x = "\nIntensité des liens (logarithme base 10)") +
    guides(color = FALSE) +
    theme_bw()
  print(g)
}
plot_ecdf("geneses")
ggsave("ecdf_geneses.png", width = 9, height = 9)
plot_ecdf("societes")
plot_ecdf("sociologie")
#
# animated networks
#
plot_network <- function(x) {
  saveGIF({
    for(q in c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9,
               .91, .92, .93, .94, .95, .96, .97, .98, .99)) {
      # keep only the edges above the q-th weight quantile
      t = quantile(e$w, q)
      n = network(e[ e$w > t, 1:2 ], directed = FALSE)
      set.edge.attribute(n, "weight", e[ e$w > t, 3 ])
      # node size from degree quartiles; flag the journal of interest
      n %v% "size" = 1 + as.numeric(cut(degree(n), unique(quantile(degree(n))), include.lowest = TRUE))
      n %v% "id" = ifelse(network.vertex.names(n) == x, "S", "A")
      colors = c("A" = "black", "S" = "red")
      colors = colors[ names(colors) %in% unique(n %v% "id") ]
      # edge transparency from weight quartiles
      n %e% "alpha" = as.numeric(cut(n %e% "weight", unique(quantile(n %e% "weight")), include.lowest = TRUE))
      n %e% "alpha" = n %e% "alpha" / max(n %e% "alpha")
      g = ggnet(n, size = 3,
                node.group = n %v% "id", node.color = colors,
                segment.alpha = n %e% "alpha", segment.color = "grey33",
                label.size = 6) + # label.size = n %v% "size",
        ggtitle(paste("seuil =", q, "\n", network.size(n), "revues")) +
        guides(color = FALSE)
      print(g)
    }
  }, movie.name = paste0("network_", x, ".gif"))
}
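
# how the threshold works, on toy weights: at q = .9, quantile() returns the
# 90th percentile, so only the top decile of edges survives the e$w > t filter
quantile(1:10, .9) # 9.1 (type-7 interpolation), keeping only the heaviest edge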
plot_network("societes")
plot_network("sociologie")
#
# relative centrality at each threshold level
#
rel_degree <- function(x) {
  wd = data.frame()
  for(q in seq(0, .99, .01)) {
    # keep only the edges above the q-th weight quantile
    t = quantile(e$w, q)
    t = e[ e$w > t, 1:3 ]
    t$i = as.character(t$i)
    t$j = as.character(t$j)
    # tnet expects integer node ids
    l = unique(c(t$i, t$j))
    t$i = as.numeric(factor(t$i, levels = l))
    t$j = as.numeric(factor(t$j, levels = l))
    t = symmetrise_w(as.tnet(t, type = "weighted one-mode tnet"))
    # degree of journal x, relative to the maximum degree in the network
    y = which(l == x)
    if(!length(y))
      wd = rbind(wd, data.frame(x, q, reldegree = 0, stringsAsFactors = FALSE))
    else
      wd = rbind(wd, data.frame(x, q,
                                reldegree = degree_w(t)[ y, 2 ] / max(degree_w(t)[, 2]),
                                stringsAsFactors = FALSE))
  }
  wd
}
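
# usage sketch: rel_degree() returns one row per threshold in seq(0, .99, .01);
# the most central journal at a given threshold scores 1, and a journal that
# drops out of the network above the threshold scores 0, e.g.
# head(rel_degree("societes"))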
wd = data.frame()
j = unique(r$revue)
for(i in rev(j)) {
  cat(sprintf("%3.0f", which(j == i)), i, "\n")
  wd = rbind(wd, rel_degree(i))
}
qplot(data = wd, y = reldegree, x = q, group = x,
      alpha = I(.5), color = I("grey"), geom = "line") +
  geom_line(data = subset(wd, x %in% c("recherche-en-soins-infirmiers",
                                       "actes-de-la-recherche-en-sciences-sociales",
                                       "societes",
                                       "sociologie",
                                       "francaise-de-sociologie",
                                       "reseaux",
                                       "geneses")),
            aes(color = x), alpha = 1, size = 1) +
  scale_color_brewer("", palette = "Set1") +
  labs(y = "Degré de centralité relatif\n", x = "\nSeuil d'intensité des liens entre les revues") +
  theme_bw() +
  theme(panel.grid = element_blank())
ggsave("degree_all.png", width = 11, height = 8)
#
# load packages
#
library(dplyr)
library(ggplot2)
library(Hmisc) # wtd.mean and wtd.quantile
library(httr)
library(rvest)
library(scales)
library(stringr)
#
# load master data
#
source("revues-data.r")
#
# summary stats
#
# by journal:
# number of issues and unique authors
by_j = group_by(d, revue) %>%
  summarise(min = min(annee), max = max(annee),
            numeros = n_distinct(numero),
            auteurs = n_distinct(auteur)) %>%
  mutate(annees = as.numeric(max) - as.numeric(min) + 1) %>%
  arrange(-annees) #%>%
  # head(., 5)
  # tail(., 5)
  # journals comparable to 'Sociétés':
  # filter(annees %in% 12:16 & auteurs %in% 443:463)
# by journal issue:
# average number of authors and articles
by_i = group_by(d, numero) %>%
  mutate(auteurs = n_distinct(auteur)) %>%
  select(numero, revue, articles, auteurs) %>%
  unique %>%
  group_by(revue) %>%
  summarise(mu_articles = mean(articles), mu_auteurs = mean(auteurs))
#
# journals per author
#
dd = d %>%
  group_by(auteur) %>%
  mutate(n = n_distinct(revue)) %>%
  group_by(revue) %>%
  summarise(mu = mean(n), sd = sd(n), n = n()) %>%
  arrange(-mu) %>%
  mutate(rank = row_number(), rankp = percent_rank(mu))
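
# percent_rank() rescales ranks to [0, 1] (toy check):
percent_rank(c(2, 4, 4, 8)) # 0.00 0.33 0.33 1.00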
# some example journals
dd$tagged = NA
dd$tagged[ dd$revue == "societes" ] = "Sociétés"
dd$tagged[ dd$revue == "sociologie" ] = "Sociologie"
dd$tagged[ dd$revue == "droit-et-societe" ] = "Droit et Société"
dd$tagged[ dd$revue == "reseaux" ] = "Réseaux"
dd$tagged[ dd$revue == "population" ] = "Population"
qplot(data = dd, y = rankp, x = mu, geom = "step") +
  geom_point(data = subset(dd, !is.na(tagged))) +
  geom_segment(data = subset(dd, !is.na(tagged)),
               aes(xend = mu + .125, yend = rankp), lty = "dotted") +
  geom_text(data = subset(dd, !is.na(tagged)),
            aes(x = mu + .15, label = tagged), hjust = 0, fontface = "italic") +
  xlim(1, 4.25) +
  scale_y_continuous(labels = percent_format()) +
  labs(x = "\nNombre moyen de revues par auteur", y = "Fréquence cumulée\n") +
  theme_bw()
ggsave("revues_ecdf.png", width = 10, height = 9)
# more comparable journals
dd$tagged = NA
dd$tagged[ dd$revue == "societes" ] = "Sociétés"
dd$tagged[ dd$revue == "geneses" ] = "Genèses"
dd$tagged[ dd$revue == "cahiers-d-etudes-africaines" ] = "Cahiers d'études africaines"
qplot(data = dd, y = rankp, x = mu, geom = "step") +
  geom_point(data = subset(dd, !is.na(tagged))) +
  geom_segment(data = subset(dd, !is.na(tagged)),
               aes(xend = mu + .125, yend = rankp), lty = "dotted") +
  geom_text(data = subset(dd, !is.na(tagged)),
            aes(x = mu + .15, label = tagged), hjust = 0, fontface = "italic") +
  xlim(1, 4.25) +
  scale_y_continuous(labels = percent_format()) +
  labs(x = "\nNombre moyen de revues par auteur", y = "Fréquence cumulée\n") +
  theme_bw()
ggsave("revues_comparables.png", width = 10, height = 9)
# sd increases with mean
qplot(data = dd, y = mu, x = sd) +
  labs(y = "Nombre moyen de revues par auteur\n", x = "\nÉcart-type") +
  geom_smooth(method = "loess", se = FALSE) +
  theme_bw()
ggsave("revues_mu_sd.png", width = 10, height = 9)
#
# 'Sociétés' journal
#
# other journals with same authors
t = unique(d[ d$revue == "societes", "auteur" ])
t = as.data.frame(table(d[ d$auteur %in% t, "revue" ]))
t = arrange(t, -Freq)
filter(t, Freq > 9)
# share of these authors' publications that appeared in other journals
(sum(t$Freq) - t$Freq[1]) / sum(t$Freq)