Last active
August 29, 2015 14:16
-
-
Save briatte/0bc4065d962d816caef3 to your computer and use it in GitHub Desktop.
scrape author metadata from 'sociology' journals on cairn.info – start with revues.r
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data = "revues-soc.csv" | |
if(!file.exists(data)) { | |
dir.create("rev", showWarnings = FALSE) | |
dir.create("num", showWarnings = FALSE) | |
r = "http://www.cairn.info/" | |
download.file(paste0(r, "discipline.php?POS=11&TITRE=ALL"), | |
"revues-soc.html", mode = "wb", quiet = TRUE) | |
y = html("revues-soc.html") | |
y = html_nodes(y, ".revue a") %>% html_attr("href") %>% unique | |
for(i in y) { | |
u = paste0(r, i) | |
f = paste0("rev/", i) | |
# first page | |
cat("Parsing:", f, "\n") | |
if(!file.exists(f)) | |
download.file(u, f, mode = "wb", quiet = TRUE) | |
h = html(f) | |
# possible additional pages | |
p = html_nodes(h, ".pager .nb a") %>% html_attr("href") | |
for(j in p) { | |
u = paste0(r, j) | |
f = gsub("\\.htm$", paste0("-", which(p == j) + 1, ".htm"), i) | |
f = paste0("rev/", f) | |
cat("Parsing:", f, "\n") | |
if(!file.exists(f)) | |
download.file(u, f, mode = "wb", quiet = TRUE) | |
} | |
} | |
d = data.frame() | |
for(i in dir("rev", pattern = "^revue", full.names = TRUE)) { | |
h = html(i) | |
# issues | |
n = h %>% | |
html_nodes(".list_numeros a") %>% | |
html_attr("href") %>% | |
unique | |
cat("\n", i, ":", length(n), "issue(s)\n") | |
for(j in rev(gsub("\\s", "%20", n))) { | |
cat(sprintf("%3.0f", which(n == j)), j) | |
u = paste0(r, j) | |
f = paste0("num/", j) | |
# some issues are missing | |
if(!file.exists(f)) | |
h = try(download.file(u, f, mode = "wb", quiet = TRUE), silent = TRUE) | |
if("try-error" %in% class(h) | !file.info(f)$size) { | |
cat(": failed\n") | |
} else { | |
h = html(f) | |
# unique authors | |
a = h %>% | |
html_nodes(".list_articles .authors a") %>% | |
html_attr("href") %>% | |
unique | |
# simplify URLs | |
a = gsub("publications-de-|\\.htm$", "", a[ a != "" ]) | |
# unique articles | |
l = str_trim(h %>% html_nodes(".list_articles .authors") %>% html_text()) | |
l = length(l[ l!= "" ]) | |
cat(":", l, "articles(s)", length(a), "author(s)\n") | |
# add to dataset | |
if(length(a)) | |
d = rbind(d, data.frame(numero = f, | |
revue = gsub("num/revue-|-\\d{4}-(.*)\\.htm", "", f), | |
annee = str_extract(f, "[0-9]{4}"), | |
auteur = a, | |
articles = l, | |
stringsAsFactors = FALSE)) | |
} | |
} | |
} | |
write.csv(d, data, row.names = FALSE) | |
} | |
d = read.csv(data, stringsAsFactors = FALSE) | |
table(d$annee) | |
sum = group_by(d, numero) %>% summarise(sum = unique(articles)) | |
cat(n_distinct(d$revue), "journals", | |
sum(sum$sum), "articles", | |
n_distinct(d$numero), "issues", | |
n_distinct(d$auteur), "authors\n") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# number of articles per author | |
n_a = group_by(d, auteur) %>% | |
summarise(n_articles = n()) | |
# author concentration index | |
h_a = d[, c("auteur", "revue") ] %>% | |
group_by(auteur, revue) %>% | |
# number of articles in each journal | |
summarise(n = n()) %>% | |
group_by(auteur) %>% | |
# journal shares by author | |
mutate(s = n / sum(n)) %>% | |
# Herfindahl–Hirschman Index | |
summarise(hhi = sum(s^2), n_revues = n()) | |
# join | |
a = full_join(n_a, h_a) %>% | |
arrange(hhi) | |
# HHI will vary between 1/14 and 1 | |
summary(a$n_revues) | |
# example authors with tons of articles in just one journal | |
filter(a, hhi == 1 & n_articles > 50) | |
# concentration decreases with number of journals per author | |
qplot(data = a, y = hhi, x = factor(n_revues), | |
geom = "boxplot") + | |
labs(y = "Indice de Herfindahl-Hirschman\n", | |
x = "\nNombre de revues par auteur") + | |
theme_bw() | |
ggsave("revues_hhi.png", width = 10, height = 9) | |
# percentage of single-journal authors + unweighted and weighted journal HHI | |
# weighted HHI is weighted by inverse frequency of author by journal | |
h = full_join(d[, c("revue", "auteur") ] %>% | |
group_by(revue, auteur) %>% | |
mutate(weight = n()) %>% | |
unique %>% | |
group_by(revue) %>% | |
mutate(weight = weight / sum(weight)), | |
a[, c("auteur", "hhi", "n_revues") ]) %>% | |
group_by(revue) %>% | |
summarise( | |
n_auts = n(), | |
p_mono = sum(n_revues == 1) / n(), | |
wt_hhi = weighted.mean(hhi, weight), | |
mu_hhi = mean(hhi), | |
sd_hhi = sd(hhi), | |
min_hhi = min(hhi), | |
max_hhi = max(hhi)) %>% | |
arrange(mu_hhi) | |
# weighting produces almost no correction | |
cor(h$mu_hhi, h$wt_hhi) | |
# ordered HHI pointranges, separated by quartiles | |
h$nq = cut(h$mu_hhi, quantile(h$mu_hhi), include.lowest = TRUE, dig.lab = 2) | |
qplot(data = h, x = reorder(revue, -mu_hhi), | |
y = mu_hhi, ymin = mu_hhi - sd_hhi, ymax = mu_hhi + sd_hhi, | |
color = nq, | |
geom = "pointrange") + | |
# geom_point(aes(y = wt_hhi), color = "grey") + | |
coord_flip() + | |
scale_color_brewer("Quartile", palette = "RdBu") + | |
labs(x = NULL, y = "\nIndice de Herfindahl-Hirschman moyen ± 1 écart-type") + | |
theme_bw() + | |
theme(legend.position = "bottom") | |
ggsave("revues_hhi_rank.png", width = 10, height = 16) | |
ggsave("revues_hhi_rank.pdf", width = 10, height = 16) | |
# correlation between HHI and mono-journal author percentage | |
with(h, cor(p_mono, wt_hhi)) | |
with(h, cor(p_mono, mu_hhi)) | |
# ordered mono-journal author percentage, separated by quartiles | |
h$nq = cut(100 * h$p_mono, quantile(100 * h$p_mono), include.lowest = TRUE, dig.lab = 2) | |
qplot(data = h, x = reorder(revue, -p_mono), | |
y = p_mono, color = nq) + | |
coord_flip() + | |
scale_color_brewer("Quartile", palette = "RdBu") + | |
scale_y_continuous(label = percent_format()) + | |
labs(x = NULL, y = "\nPourcentage d'auteurs ayant publié dans cette seule revue") + | |
theme_bw() + | |
theme(legend.position = "bottom") | |
ggsave("revues_mono_rank.png", width = 10, height = 16) | |
ggsave("revues_mono_rank.pdf", width = 10, height = 16) | |
# 'blind' percentage of editorial concentration, by journal | |
e = data.frame() | |
for(i in unique(d$revue)) { | |
t = unique(d[ d$revue == i, "auteur" ]) | |
t = as.data.frame(table(d[ d$auteur %in% t, "revue" ])) | |
t = 1 - (sum(t$Freq) - t$Freq[ t$Var1 == i ]) / sum(t$Freq) | |
e = rbind(e, data.frame(revue = i, t, stringsAsFactors = FALSE)) | |
} | |
e = full_join(h, arrange(e, -t)) | |
# correlation to HHI and mono-journal author percentage | |
with(e, cor(p_mono, t)) | |
with(e, cor(mu_hhi, t)) | |
# ordered editorial concentration rate, separated by quartiles | |
e$nq = cut(100 * e$t, quantile(100 * e$t), include.lowest = TRUE, dig.lab = 2) | |
qplot(data = e, x = reorder(revue, -t), | |
y = t, color = nq) + | |
coord_flip() + | |
scale_color_brewer("Quartile", palette = "RdBu") + | |
scale_y_continuous(label = percent_format()) + | |
labs(x = NULL, y = "\nTaux de concentration de la production écrite des auteurs") + | |
theme_bw() + | |
theme(legend.position = "bottom") | |
ggsave("revues_conc_rank.png", width = 10, height = 16) | |
ggsave("revues_conc_rank.pdf", width = 10, height = 16) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(animation) | |
library(network) | |
library(GGally) | |
library(sna) # unweighted degree | |
library(tnet) # weighted degree | |
dir.create("plots", showWarnings = FALSE) | |
dir.create("csv", showWarnings = FALSE) | |
# | |
# shared authors between journals, weighted by articles | |
# (run revues-data.r first) | |
# | |
k = paste0("revue-", unique(d$revue)) | |
for(j in k) { | |
n = dir("num", pattern = paste0("^", j, "-\\d{4}"), full.names = TRUE) | |
n = n[ file.info(n)$size > 0 ] | |
f = paste0("csv/", j, ".csv") | |
if(!file.exists(f)) { | |
cat("\n", j, ":", length(n), "issue(s)\n") | |
r = data.frame() | |
for(i in rev(n)) { | |
cat(sprintf("%3.0f", which(n == i)), i) | |
h = html(i) | |
# unique authors per article | |
a = html_nodes(h, ".list_articles .authors") | |
a = sapply(a, function(x) html_nodes(x, "a") %>% | |
html_attr("href") %>% | |
unique %>% | |
paste0(., collapse = ";")) | |
# simplify URLs | |
a = gsub("publications-de-|\\.htm", "", a[ a != "" ]) | |
cat(":", length(a), "articles(s)\n") | |
# add to dataset | |
if(length(a)) | |
r = rbind(r, data.frame(numero = i, | |
revue = gsub("num/revue-|-\\d{4}-(.*)\\.htm", "", i), | |
annee = str_extract(i, "[0-9]{4}"), | |
auteurs = a, | |
stringsAsFactors = FALSE)) | |
} | |
write.csv(r, f, row.names = FALSE) | |
} | |
} | |
r = dir("csv", pattern = "^revue-", full.names = TRUE) | |
r = lapply(r, read.csv, stringsAsFactors = FALSE) | |
r = bind_rows(r) | |
# | |
# journal-specific edge lists | |
# | |
a = strsplit(r$auteurs, ";") | |
e = data.frame() | |
for(i in unique(r$revue)) { | |
y = unique(unlist(strsplit(r$auteurs[ r$revue == i ], ";"))) | |
cat(i, ":", length(y), "authors ") | |
y = r[ which(sapply(a, function(x) any(x %in% y))), ] | |
cat(nrow(y), "articles", n_distinct(y$revue), "journals\n") | |
y = as.data.frame(table(y$revue)) | |
y$Freq = y$Freq / y$Freq[ y$Var1 == i ] | |
y = y[ y$Var1 != i, ] | |
e = rbind(e, data.frame(i, j = y$Var1, w = y$Freq, stringsAsFactors = FALSE)) | |
} | |
# | |
# weighted edge list | |
# | |
n = apply(e[, 1:2 ], 1, function(x) paste0(sort(x), collapse = "///")) | |
n = data.frame(n, w = e$w) | |
e = aggregate(w ~ n, sum, data = n) | |
e = data.frame(i = gsub("(.*)///(.*)", "\\1", e$n), | |
j = gsub("(.*)///(.*)", "\\2", e$n), | |
w = e$w) | |
e = mutate(e, ecdf = cume_dist(w)) | |
qplot(data = e, x = w, y = ecdf, color = I("grey")) + | |
scale_x_log10() + | |
labs(y = "Fréquence cumulée\n", x = "\nIntensité des liens (logarithme base 10)") + | |
theme_bw() | |
ggsave("ecdf.png", width = 9, height = 9) | |
e = filter(e, w < 1) %>% mutate(ecdf = cume_dist(w)) | |
plot_ecdf = function(x) { | |
g = qplot(data = subset(e, i != x & j != x), | |
x = w, y = ecdf, color = I("grey")) + | |
geom_point(data = subset(e, (i == x | j == x)), color = "black") + | |
geom_rug(data = subset(e, (i == x | j == x)), sides = "b") + | |
scale_color_manual(values = c("TRUE" = "black", "FALSE" = "grey")) + | |
scale_x_log10() + | |
labs(y = "Fréquence cumulée\n", x = "\nIntensité des liens (logarithme base 10)") + | |
guides(color = FALSE) + | |
theme_bw() | |
print(g) | |
} | |
plot_ecdf("geneses") | |
ggsave("ecdf_geneses.png", width = 9, height = 9) | |
plot_ecdf("societes") | |
plot_ecdf("sociologie") | |
# | |
# animated networks | |
# | |
plot_network <- function(x) { | |
saveGIF({ | |
for(q in c(0, .1, .2, .3, .4, .5, .6, .7, .8, .9, | |
.91, .92, .93, .94, .95, .96, .97, .98, .99)) { | |
t = quantile(e$w, q) | |
n = network(e[ e$w > t, 1:2 ], directed = FALSE) | |
set.edge.attribute(n, "weight", e[ e$w > t, 3 ]) | |
n %v% "size" = 1 + as.numeric(cut(degree(n), unique(quantile(degree(n))), include.lowest = TRUE)) | |
n %v% "id" = ifelse(network.vertex.names(n) == x, "S", "A") | |
colors = c("A" = "black", "S" = "red") | |
colors = colors[ names(colors) %in% unique(n %v% "id") ] | |
n %e% "alpha" = as.numeric(cut(n %e% "weight", unique(quantile(n %e% "weight")), include.lowest = TRUE)) | |
n %e% "alpha" = n %e% "alpha" / max(n %e% "alpha") | |
g = ggnet(n, size = 3, | |
node.group = n %v% "id", node.color = colors, | |
segment.alpha = n %e% "alpha", segment.color = "grey33", | |
label.size = 6) + # label.size = n %v% "size", | |
ggtitle(paste("seuil =", q, "\n", network.size(n), "revues")) + | |
guides(color = FALSE) | |
print(g) | |
} | |
}, movie.name = paste0("network_", x, ".gif")) | |
} | |
plot_network("societes") | |
plot_network("sociologie") | |
# | |
# relative centrality at each threshold level | |
# | |
rel_degree <- function(x) { | |
wd = data.frame() | |
for(q in seq(0, .99, .01)) { | |
t = quantile(e$w, q) | |
t = e[ e$w > t, 1:3 ] | |
t$i = as.character(t$i) | |
t$j = as.character(t$j) | |
l = unique(c(t$i, t$j)) | |
t$i = as.numeric(factor(t$i, levels = l)) | |
t$j = as.numeric(factor(t$j, levels = l)) | |
t = symmetrise_w(as.tnet(t, type = "weighted one-mode tnet")) | |
y = which(l == x) | |
if(!length(y)) | |
wd = rbind(wd, data.frame(x, q, reldegree = 0, stringsAsFactors = FALSE)) | |
else | |
wd = rbind(wd, data.frame(x, q, | |
reldegree = degree_w(t)[ y, 2 ] / max(degree_w(t)[, 2]), | |
stringsAsFactors = FALSE)) | |
} | |
wd | |
} | |
wd = data.frame() | |
j = unique(r$revue) | |
for(i in rev(j)) { | |
cat(sprintf("%3.0f", which(j == i)), i, "\n") | |
wd = rbind(wd, rel_degree(i)) | |
} | |
qplot(data = wd, y = reldegree, x = q, group = x, | |
alpha = I(.5), color = I("grey"), geom = "line") + | |
geom_line(data = subset(wd, x %in% c("recherche-en-soins-infirmiers", | |
"actes-de-la-recherche-en-sciences-sociales", | |
"societes", | |
"sociologie", | |
"francaise-de-sociologie", | |
"reseaux", | |
"geneses")), | |
aes(color = x), alpha = 1, size = 1) + | |
scale_color_brewer("", palette = "Set1") + | |
labs(y = "Degré de centralité relatif\n", x = "\nSeuil d'intensité des liens entre les revues") + | |
theme_bw() + | |
theme(panel.grid = element_blank()) | |
ggsave("degree_all.png", width = 11, height = 8) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# load packages | |
# | |
library(dplyr) | |
library(ggplot2) | |
library(Hmisc) # wtd.mean and wtd.quantile | |
library(httr) | |
library(rvest) | |
library(scales) | |
library(stringr) | |
# | |
# load master data | |
# | |
source("revues-data.r") | |
# | |
# summary stats | |
# | |
# by journal: | |
# number of issues and unique authors | |
by_j = group_by(d, revue) %>% | |
summarise(min = min(annee), max = max(annee), | |
numeros = n_distinct(numero), | |
auteurs = n_distinct(auteur)) %>% | |
mutate(annees = as.numeric(max) - as.numeric(min) + 1) %>% | |
arrange(-annees) #%>% | |
# head(., 5) | |
# tail(., 5) | |
# comparable to 'Sociétés' | |
# filter(annees %in% 12:16 & auteurs %in% 443:463) | |
# by journal issue: | |
# average number of authors and articles | |
by_i = group_by(d, numero) %>% | |
mutate(auteurs = n_distinct(auteur)) %>% | |
select(numero, revue, articles, auteurs) %>% | |
unique %>% | |
group_by(revue) %>% | |
summarise(mu_articles = mean(articles), mu_auteurs = mean(auteurs)) | |
# | |
# journals per author | |
# | |
dd = d %>% | |
group_by(auteur) %>% | |
mutate(n = n_distinct(revue)) %>% | |
group_by(revue) %>% | |
summarise(mu = mean(n), sd = sd(n), n = n()) %>% | |
arrange(-mu) %>% | |
mutate(rank = row_number(), rankp = percent_rank(mu)) | |
# some example journals | |
dd$tagged = NA | |
dd$tagged[ dd$revue == "societes" ] = "Sociétés" | |
dd$tagged[ dd$revue == "sociologie" ] = "Sociologie" | |
dd$tagged[ dd$revue == "droit-et-societe" ] = "Droit et Société" | |
dd$tagged[ dd$revue == "reseaux" ] = "Réseaux" | |
dd$tagged[ dd$revue == "population" ] = "Population" | |
qplot(data = dd, y = rankp, x = mu, geom = "step") + | |
geom_point(data = subset(dd, !is.na(tagged))) + | |
geom_segment(data = subset(dd, !is.na(tagged)), | |
aes(xend = mu + .125, yend = rankp), lty = "dotted") + | |
geom_text(data = subset(dd, !is.na(tagged)), | |
aes(x = mu + .15, label = tagged), hjust = 0, fontface = "italic") + | |
xlim(1, 4.25) + | |
scale_y_continuous(label = percent_format()) + | |
labs(x = "\nNombre moyen de revues par auteur", y = "Fréquence cumulée\n") + | |
theme_bw() | |
ggsave("revues_ecdf.png", width = 10, height = 9) | |
# more comparable journals | |
dd$tagged = NA | |
dd$tagged[ dd$revue == "societes" ] = "Sociétés" | |
dd$tagged[ dd$revue == "geneses" ] = "Genèses" | |
dd$tagged[ dd$revue == "cahiers-d-etudes-africaines" ] = "Cahiers d'études africaines" | |
qplot(data = dd, y = rankp, x = mu, geom = "step") + | |
geom_point(data = subset(dd, !is.na(tagged))) + | |
geom_segment(data = subset(dd, !is.na(tagged)), | |
aes(xend = mu + .125, yend = rankp), lty = "dotted") + | |
geom_text(data = subset(dd, !is.na(tagged)), | |
aes(x = mu + .15, label = tagged), hjust = 0, fontface = "italic") + | |
xlim(1, 4.25) + | |
scale_y_continuous(label = percent_format()) + | |
labs(x = "\nNombre moyen de revues par auteur", y = "Fréquence cumulée\n") + | |
theme_bw() | |
ggsave("revues_comparables.png", width = 10, height = 9) | |
# sd increases with mean | |
qplot(data = dd, y = mu, x = sd) + | |
labs(y = "Nombre moyen de revues par auteur\n", x = "\nÉcart-type") + | |
geom_smooth(method = "loess", se = FALSE) + | |
theme_bw() | |
ggsave("revues_mu_sd.png", width = 10, height = 9) | |
# | |
# 'Sociétés' journal | |
# | |
# other journals with same authors | |
t = unique(d[ d$revue == "societes", "auteur" ]) | |
t = as.data.frame(table(d[ d$auteur %in% t, "revue" ])) | |
t = arrange(t, -Freq) | |
filter(t, Freq > 9) | |
# percentage of other journals | |
(sum(t$Freq) - t$Freq[1]) / sum(t$Freq) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment