Skip to content

Instantly share code, notes, and snippets.

@sje30
Last active May 16, 2020 21:22
Show Gist options
  • Save sje30/6d906c080171eb164689501c9b0232a9 to your computer and use it in GitHub Desktop.
Save sje30/6d906c080171eb164689501c9b0232a9 to your computer and use it in GitHub Desktop.
## How many new articles have been deposited in biorxiv? When new
## papers get uploaded to biorxiv, this information is sent to
## crossref. So, we can use the excellent rcrossref package from the
## ropensci team to get information.
## Note, this does not include revised versions of papers.
##
## Thanks to Scott Chamberlain for providing the magic lines of code to
## grab the information efficiently from crossref.
## Note also there is some code below to grab the information from
## http://www.cshsymposium.com/biorxiv/show_all.php
## and on that site you can also generate similar graphs.
require(rcrossref)
## Query crossref for works registered under the DOI prefix 10.1101,
## restricted to type "report", and time how long the request takes.
## Results are paged with a cursor (1000 per request); cursor_max is
## passed as 5000 -- presumably the cap on total records, see the
## rcrossref documentation.
system.time(
  res <- cr_prefixes(
    prefixes = "10.1101", works = TRUE, limit = 1000,
    cursor = "*", cursor_max = 5000,
    filter = list(type = "report"), .progress = "text"
  )
)

## colours taken from http://colorbrewer.org
col1 <- "#f1eef6"
col2 <- "#bdc9e1"
col3 <- "#74a9cf"
col4 <- "#0570b0"

## note problem with histogram colour changing...
## http://stackoverflow.com/questions/5649600/axis-color-of-date-histogram-in-r
d <- as.Date(res$data$created)

## pdf(file = "biorxiv_deposits.pdf", width = 7, height = 4)
## svg() names its first argument `filename`; spell it out in full
## rather than relying on partial matching of `file`.
svg(filename = "biorxiv_deposits.svg", width = 7, height = 4)
par(mar = c(4.5, 4.5, 0, 0.4), cex.axis = 0.8)

## Monthly histogram of creation dates.  The default axes are
## suppressed and redrawn below in black (see the stackoverflow link
## above).  The colour vector supplies 2 + 12 + 12 + 6 = 32 bar
## colours, in bar order.
hist(d, "months",
  format = "%y-%m", main = "",
  ylab = "First submission", xlab = "Date",
  axes = FALSE,
  col = c(
    rep(col1, 2),
    rep(col2, 12),
    rep(col3, 12),
    rep(col4, 6) # increase post June 2016...
  ),
  freq = TRUE, las = 2, ylim = c(0, 400)
)
Axis(d, col = "black", side = 1)
axis(2, col = "black", las = 1)
dev.off()

## Quit R here so that the old code below is never run.
q()
### old code below
### Exploratory crossref calls kept for reference; each bare call
### prints its result when run at top level.

## summary for the DOI prefix
cr_prefixes(prefixes = "10.1101")

## citation text for two individual DOIs
cr_cn(
  dois = "10.1126/science.169.3946.635",
  format = "text"
)
cr_cn(
  dois = "10.1101/045104",
  format = "text"
)

## graph at: http://www.cshsymposium.com/biorxiv/usage_monthly.php
## screen scrape via: http://www.cshsymposium.com/biorxiv/show_all.php

## one page of 1000 works, starting at offset 1000
res <- cr_prefixes(
  prefixes = "10.1101",
  works = TRUE,
  filter = list(type = "report"),
  offset = 1000,
  limit = 1000
)
res$data$DOI

cr_cn(
  dois = "10.1101/012799",
  format = "text"
)
## Fetch one page of works for the DOI prefix 10.1101 (type "report")
## and return a data frame with one row per work.
##
## offset: position in the crossref result set at which the page starts.
## limit:  maximum number of works requested for the page.
##
## Returns a data.frame with columns `doi` and `created`.  The DOIs
## come from the cr_prefixes() listing; the `created` values come from
## a second lookup of those DOIs with cr_works().
## NOTE(review): this assumes cr_works() returns its rows in the same
## order as `dois` -- confirm.
get_chunk <- function(offset = 0, limit = 20) {
  listing <- cr_prefixes(
    prefixes = "10.1101",
    works = TRUE,
    filter = list(type = "report"),
    offset = offset,
    limit = limit
  )
  dois <- listing$data$DOI
  works <- cr_works(dois = dois)
  data.frame(doi = dois, created = works$data$created)
}
## Fetch all works by calling get_chunk() page after page, starting at
## `offset`, until a page comes back with fewer rows than requested.
##
## offset: position in the result set at which to start (default 0).
##
## Returns a single data.frame holding the rows of every page fetched,
## in fetch order.  The offset of the next page is printed after each
## request as a progress indicator.
get_chunks <- function(offset = 0) {
  limit <- 1000
  ## Collect the pages in a list and bind them once at the end;
  ## calling rbind() inside the loop re-copies every earlier row on
  ## each iteration.
  pages <- list()
  repeat {
    page <- get_chunk(offset, limit)
    offset <- offset + limit
    print(offset)
    pages[[length(pages) + 1L]] <- page
    ## a short page means there is nothing left to fetch
    if (nrow(page) < limit) {
      break
    }
  }
  do.call(rbind, pages)
}
## Fetch everything with get_chunks() and plot the creation dates as
## monthly histograms, using two different date label formats.
f3 <- get_chunks()
dates <- as.Date(f3$created)
hist(dates, "months", format = "%d %b")
## TRUE rather than T: T is an ordinary variable and can be reassigned.
hist(dates, "months", format = "%d %b %y", freq = TRUE)

## a single page (default limit) starting at offset 3000
f2 <- get_chunk(offset = 3000)
######################################################################
## w3m -dump -cols 999 http://www.cshsymposium.com/biorxiv/show_all.php > o2.txt
## Parse the text dump written by the command above: pull a DOI and a
## date out of fixed character columns of each remaining line, then
## plot the dates as a monthly histogram.
dat <- readLines("o2.txt")
## drop the first six lines of the dump (presumably the page header --
## confirm against a current dump)
dat <- dat[-(1:6)]
## remove blank lines.  A logical filter is used because dat[-g] with
## an empty g (no blank lines at all) would drop every line.
dat <- dat[nzchar(dat)]
## find last line "Total Articles, Unique: "
g <- grep("^Total Articles, Unique", dat)
if (length(g) == 0) {
  stop("no 'Total Articles, Unique' line found in o2.txt", call. = FALSE)
}
## throw away tail: keep only the lines before the (first) footer line.
dat <- dat[seq_len(g[1] - 1)]
## fixed-width fields: characters 7-20 hold the DOI, 28-37 the date
dois <- substring(dat, 7, 20)
dates <- as.Date(substring(dat, 28, 37))
hist(dates, "months", format = "%d-%b-%y", freq = TRUE)
* Application for Frictionless data tool fund
Stephen Eglen, 2020-05-16
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment