Skip to content

Instantly share code, notes, and snippets.

@rcastelo
Last active July 14, 2020 16:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rcastelo/276a2d3fd26d5511aff286ec8cf1e046 to your computer and use it in GitHub Desktop.
Save rcastelo/276a2d3fd26d5511aff286ec8cf1e046 to your computer and use it in GitHub Desktop.
Bioconductor downloads and dependencies
library(igraph)
library(BiocPkgTools)
## fetch download statistics and the package metadata table through
## BiocPkgTools
d <- biocDownloadStats()
pkgs <- biocPkgList()

## "core infrastructure packages", criterion 1: packages whose Maintainer
## field matches the pattern "maintainer" (presumably the generic
## Bioconductor maintainer address -- TODO confirm) or the name of one of
## the core team members listed below
whmaintainerpkgs <- grep("maintainer", pkgs$Maintainer)
coreteam <- c("interdonato", "michafla", "jmacdon",
              "morgan", "andrzej", "pages",
              "ramos", "shepherd", "grimbough",
              "turaga", "VanTwisk", "jwang96")
whcoreteampkgs <- grep(paste(coreteam, collapse="|"), pkgs$Maintainer)
corepkgs1 <- pkgs$Package[unique(c(whmaintainerpkgs, whcoreteampkgs))]

## criterion 2: packages whose biocViews terms include "DataRepresentation";
## the terms of each package are collapsed into a single string so that
## one grep() call can search them (vapply() guarantees a character vector)
biocviews <- vapply(pkgs$biocViews, paste, character(1), collapse="_")
whcorepkgs2 <- grep("DataRepresentation", biocviews)
## translate the matching row indices into package names; without this
## assignment 'corepkgs2' is undefined and the next line fails
corepkgs2 <- pkgs$Package[whcorepkgs2]

## union of both criteria
corepkgs <- unique(c(corepkgs1, corepkgs2))
## dependency graph of the "BioCsoft" repository, restricted to the
## "Depends" and "Imports" relationships
dep_df <- buildPkgDependencyDataFrame(dependencies=c("Depends", "Imports"),
                                      repo="BioCsoft")
g <- buildPkgDependencyIgraph(dep_df)

## Bioconductor packages that are missing from the graph (packages without
## dependencies) are added as extra vertices
missingpkgs <- pkgs$Package[is.na(match(pkgs$Package, names(V(g))))]
g <- g + vertices(missingpkgs)

## keep only the vertices that correspond to Bioconductor packages
inbioc <- names(V(g)) %in% pkgs$Package
excludedpkgs <- names(V(g))[!inbioc]
g <- induced_subgraph(g, setdiff(names(V(g)), excludedpkgs))

## non-core packages are the Bioconductor packages not listed as core
noncorepkgs <- setdiff(pkgs$Package, corepkgs)
## calculate for non-core Bioconductor packages their median monthly
## downloads (number of distinct IPs) over the last 12 full months, the
## total number of dependences and the number of dependences on
## "core infrastructure packages"
res <- data.frame(Downloads=integer(length(noncorepkgs)),
                  Ndeps=integer(length(noncorepkgs)),
                  Ncoredeps=integer(length(noncorepkgs)),
                  row.names=noncorepkgs, check.names=FALSE)

## the download window is identical for every package, so it is computed
## once from a single date: subtracting the day of the month from today
## gives the last day of the previous month, and the window reaches back
## 365 days from there
today <- Sys.Date()
lastfullmonth <- today - as.POSIXlt(today)$mday
windowstart <- lastfullmonth - 365

for (p in noncorepkgs) {
  ## fetch dependences: every vertex reachable from 'p' through "out"
  ## edges, excluding 'p' itself (assumes edges point from a package to
  ## the packages it depends on -- TODO confirm)
  deps <- setdiff(names(subcomponent(g, p, mode="out")), p)

  ## fetch number of dependences on "core infrastructure packages"
  ncoredeps <- sum(deps %in% corepkgs)

  ## monthly records of 'p' (rows with Month equal to "all" are dropped)
  ## from the "Software" repo that fall inside the download window
  d.pkg <- d[d$Package %in% p & d$Month != "all", ]
  mask <- d.pkg$repo == "Software" &
          d.pkg$Date >= windowstart & d.pkg$Date <= lastfullmonth
  d.pkg <- d.pkg$Nb_of_distinct_IPs[mask]
  ## months with zero downloads do not count
  d.pkg <- d.pkg[d.pkg > 0]

  ## the median is only recorded when exactly 12 monthly values with
  ## nonzero downloads are available; otherwise Downloads is set to NA
  if (length(d.pkg) == 12) {
    res[p, ] <- c(median(d.pkg), length(deps), ncoredeps)
  } else {
    res[p, ] <- c(NA, length(deps), ncoredeps)
  }
}

## discard packages without 12 months of nonzero download data
## (e.g., new packages)
res <- res[!is.na(res$Downloads), ]
## store the per-package table on disk
saveRDS(res, file="downloadsbydeps.rds")

## two groups of packages: the interval [0, 1) holds those without core
## dependences, and the closed interval from 1 up to the observed maximum
## holds those with at least one
ncd <- cut(res$Ncoredeps, breaks=c(0, 1, max(res$Ncoredeps)),
           include.lowest=TRUE, right=FALSE)

## downloads on the log10 scale, plotted against the two groups
logdownloads <- log10(res$Downloads)
plot(logdownloads ~ ncd, las=1,
     xlab="Number of core dependences",
     ylab="log10 Monthly downloads")

## overlay the mean of each group, slightly shifted to the right
groupmeans <- tapply(logdownloads, ncd, mean)
points(c(1, 2) + 0.1, groupmeans, pch=23, bg="black")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment