# blmoore/plos_authInflation.R

## plos_authInflation.R
# Setup: store the PLoS API key as a session option and attach the packages
# used by the rest of the script.
# NOTE(review): presumably rplos looks the key up through this option --
# confirm against the documentation of the installed rplos version.
options(PlosApiKey = "<insert your API key here!>")
#install_github("rplos", "ropensci")
library("rplos")
library("ggplot2")
# library() rather than require(): a missing dplyr should stop the script
# here instead of returning FALSE and failing later at the first dplyr call.
library("dplyr")

# Convert author strings to counts.
#
# Args:
#   cell: character vector (or factor); each element holds the authors of one
#         paper separated by ";".
# Returns:
#   Unnamed integer vector, the same length as `cell`, with the number of
#   ";"-separated pieces in each element (integer(0) for empty input).
#   An NA element counts as 1 because strsplit() returns it as one NA piece.
countAuths <- function(cell) {
  # strsplit() is already vectorised, so no Vectorize()/mapply wrapper is
  # needed. The wrapper also labelled every count with its complete author
  # string and returned list() instead of integer(0) for empty input.
  pieces <- strsplit(as.character(cell), ";", fixed = TRUE)
  vapply(pieces, length, integer(1), USE.NAMES = FALSE)
}

# Query the PLoS API for up to `lim` papers per journal per year,
# count the number of authors and return a data.frame.
#
# Args:
#   j:          journal key, matched against cross_published_journal_key.
#   lim:        maximum number of records requested for each year.
#   start.year: first publication year to query.
#   end.year:   last publication year to query (inclusive).
# Returns:
#   data.frame with one row per returned paper: `year`, the `author` field as
#   returned by searchplos(), `counts` (from countAuths()) and `journal`.
#
# NOTE(review): assumes searchplos() returns a one-column data.frame named
# `author`; other rplos releases may wrap their results differently -- confirm
# against the installed version.
getAuths <- function(j, lim=1000, start.year=2006, end.year=2013){
  cat("Getting results for journal: ", j, "\n")
  # seem to be in reverse order by year?
  per.year <- lapply(start.year:end.year, function(i) {
    hits <- searchplos(
      q  = paste0('publication_date:[', i,
                  '-01-01T00:00:00Z TO ', i,
                  '-12-31T23:59:59Z]'),
      fl = "author",
      fq = list("doc_type:full",
                paste0("cross_published_journal_key:", j)),
      start = 0, limit = lim, sleep = 6)
    # `year` is supplied once: passing it a second time only added a
    # redundant `year.1` column. Repeating it NROW(hits) times keeps a year
    # with no hits as a zero-row frame instead of a row-count mismatch error.
    data.frame(year = rep(i, NROW(hits)), auths = hits)
  })
  results <- do.call(rbind, per.year)
  results$counts <- countAuths(results$author)
  results$journal <- j
  results
}

# Fetch author counts for the journals at positions 1-5 and 7 of the key
# vector and stack the per-journal data.frames into one.
journals <- journalnamekey()
# simplify = FALSE keeps the result a list of data.frames for rbind();
# spelled out because `F` is an ordinary variable that can be reassigned.
plos.all <- sapply(journals[c(1:5, 7)], getAuths, simplify = FALSE)
plos <- do.call(rbind, plos.all)

# Fig. 1: Bean plot showing distribution of author counts
#         per journal overall
# Each journal gets a violin with a narrow notched boxplot and a white
# median point drawn on top; the count axis is log10-scaled.
# NOTE(review): `fun.y` is deprecated from ggplot2 3.3.0 in favour of `fun`;
# it is kept here so the script still runs on older ggplot2 releases.
ggplot(plos, aes(x = journal, y = counts, fill = journal)) +
  geom_violin(scale = "width") +
  geom_boxplot(width = .12, fill = I("black"), notch = TRUE,
               outlier.size = NA, col = "grey40") +
  stat_summary(fun.y = "median", geom = "point", shape = 20, col = "white") +
  scale_y_log10(breaks = c(1:5, seq(10, 50, by = 10), 100, 200, 300)) +
  coord_flip() + labs(x = "", y = "Number of authors per paper") +
  theme_classic() + theme(legend.position = "none") +
  scale_fill_brewer()

# Fig 2. ECDFs of the author count distributions
# 5in x 5in
# One ECDF curve per journal on a log10 x axis; coord_cartesian() limits the
# visible range to 1-300 authors.
ggplot(plos, aes(x = counts, col = journal)) +
  stat_ecdf(geom = "smooth", se = FALSE, size = 1.2) + theme_bw() +
  scale_x_log10(breaks = c(1:5, seq(10, 50, by = 10), 100, 200, 300)) +
  theme(legend.position = c(.75, .33)) +
  labs(x = "Number of authors per paper", y = "ECDF",
       col = "") + coord_cartesian(xlim = c(1, 300)) +
  scale_color_brewer(type = "qual", palette = 6)

# Fig 3. Trends in author counts over time with
#        confidence limits on the means
# 7 x 7
# Per journal: a line through the yearly means with a semi-transparent
# ribbon spanning the limits returned by mean_cl_boot.
ggplot(plos, aes(x = year, y = counts, col = journal, fill = journal)) +
  # No `width` here: it was passed to a ribbon layer, which does not use it.
  stat_summary(fun.data = "mean_cl_boot", geom = "ribbon", alpha = I(.5)) +
  stat_summary(fun.y = "mean", geom = "line") +
  # Labels are passed as named arguments; wrapping them in list() depends on
  # labs() unwrapping a list argument, which not every ggplot2 release does.
  labs(x = "Year", y = "Mean number of authors per paper") +
  theme_bw() + theme(legend.position = c(.2, .85)) +
  scale_fill_brewer(type = "qual", palette = 2,
                    guide = guide_legend(direction = "vertical",
                                         label.position = "bottom",
                                         title = NULL, ncol = 2,
                                         label.hjust = 0.5)) +
  scale_color_brewer(type = "qual", palette = 2, guide = "none")


# from http://stackoverflow.com/a/17024184/1274516
# show regression equation on each graph facet
#
# Args:
#   df: data.frame with numeric columns `counts` and `year`.
# Returns:
#   A single string holding a plotmath expression of the form
#   y = <slope> x + <intercept> (R^2 == <r2>) for the fit counts ~ year,
#   meant to be drawn with parse = TRUE.
lm_eqn <- function(df) {
  fit <- summary(lm(counts ~ year, data = df))
  est <- fit$coefficients[, "Estimate"]

  # [[ ]] drops the coefficient names, so each formatted value is embedded in
  # the expression as a bare string.
  slope     <- format(est[[2]], digits = 3)
  intercept <- format(est[[1]], digits = 3)
  r2        <- format(fit$r.squared, digits = 2)

  eq <- bquote(~~y ~ "=" ~ .(slope) * x + .(intercept) ~ (R^2 == .(r2)))
  as.character(as.expression(eq))
}

# Mean author count per journal and year, then one regression label per
# journal (built by lm_eqn) for the facetted figure.
# `%>%` replaces `%.%`, which dplyr deprecated and later removed.
means <- group_by(plos, journal, year) %>% summarise(counts = mean(counts))
b <- by(means, means$journal, lm_eqn)
df <- data.frame(beta = unclass(b), journal = names(b))
summary(lm(counts ~ year + journal, data = means))

# Label height per journal: 20% above that journal's largest yearly mean.
means <- group_by(means, journal) %>% summarise(m = max(counts))
# Look the maximum up by journal instead of relying on `df` and `means`
# happening to list the journals in the same row order.
df$top <- means$m[match(df$journal, means$journal)] * 1.2

# Fig 4. Facetted linear regression of author inflation per journal
# 6 x 8.5
# One panel per journal: yearly means with mean_cl_boot error bars, a linear
# fit, and the regression equation from `df` drawn at x = 2009.5, y = top.
ggplot(plos, aes(x = year, y = counts, col = journal, fill = journal)) +
  stat_summary(fun.data = "mean_cl_boot", geom = "errorbar",
               width = .2, alpha = I(.5)) +
  stat_summary(fun.y = "mean", geom = "point") +
  #stat_summary(fun.y="median", geom="point", shape=4) +
  facet_wrap(~journal, scales = "free_y") +
  geom_smooth(method = "lm") +
  scale_x_continuous(breaks = 2006:2013) +
  # Labels are passed as named arguments; wrapping them in list() depends on
  # labs() unwrapping a list argument, which not every ggplot2 release does.
  labs(x = "", y = "Mean number of authors per paper") +
  theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(type = "qual", palette = 2, guide = "none") +
  scale_color_brewer(type = "qual", palette = 2, guide = "none") +
  # parse = TRUE renders the label strings as plotmath expressions.
  geom_text(data = df, aes(x = 2009.5, y = top, label = beta), size = 3,
            parse = TRUE)

# Overall estimate of author inflation?
# .21 extra authors per paper per year, on average
# Fits author count on year plus journal across all papers and keeps the
# model summary in `s`.
s <- summary(
  lm(formula = counts ~ year + journal, data = plos)
)


# Summary barchart data: one row per journal with a hard-coded trend
# estimate, its standard error and an impact factor.
# NOTE(review): the numbers are positional, so they must line up with the
# order of unique(means$journal); presumably they were copied from the
# per-journal fits -- verify before re-using with different data.
bc <- data.frame(
  journal = unique(means$journal),
  trend   = c(0.2490979, 0.1211823, 0.5201688,
              0.4088536, 0.05894102, 0.1828939),
  std.err = c(0.08224567, 0.02213142, 0.1493662,
              0.06361849, 0.03891493, 0.03798822),
  IF      = c(12.690, 4.867, 8.517,
              15.253, 3.730, 8.136)
)

# Order the journal factor by increasing trend so plots sort the same way.
bc$journal <- with(bc, factor(journal, levels = journal[order(trend)]))

# Fig 5. Barchart of author inflation estimate per journal.
# 7 x 5
# Bars show the trend per journal; error bars span trend +/- one std.err.
ggplot(
  bc,
  aes(x = journal, y = trend, fill = journal,
      ymin = trend - std.err, ymax = trend + std.err)
) +
  geom_bar(stat = "identity") +
  geom_errorbar(width = 0.2) +
  scale_fill_brewer(palette = "Blues", guide = "none") +
  scale_y_continuous(expand = c(0, 0)) +
  labs(
    x = "",
    y = "Estimate of annual author inflation (additional mean authors per paper)"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Correlation (cor() default method) between trend and impact factor.
pcc <- cor(bc$trend, bc$IF)
# Fig 6. Correlation of author inflation and journal impact factors.
# 5 x 5
# Journal names are drawn as text at (trend, IF); the coefficient is added
# as a plotmath annotation.
ggplot(bc, aes(x = trend, y = IF, col = journal)) +
  geom_text(aes(label = journal)) + xlim(0, .6) +
  labs(x = "Author inflation estimate",
       y = "Journal impact factor (2012)") +
  scale_color_brewer(type = "qual", palette = 2, guide = "none") +
  annotate("text", x = .05, y = 15,
           label = paste0("rho == ", format(pcc, digits = 2)), parse = TRUE)

# N.S. (p = 0.18)
cor.test(bc$trend, bc$IF)
	# (A second, verbatim copy of the script above used to follow here. It has
	# been dropped so that sourcing the file queries the PLoS API and draws
	# each figure only once.)