Skip to content

Instantly share code, notes, and snippets.

@andrie
Last active February 26, 2020 19:38
Show Gist options
  • Save andrie/05ff90f1f127d488f18cacb828a0e3f8 to your computer and use it in GitHub Desktop.
Save andrie/05ff90f1f127d488f18cacb828a0e3f8 to your computer and use it in GitHub Desktop.
Scrapes CRAN for historical number of packages per release
# Scrapes CRAN archives to determine the number of packages per release
# Create the list of per-release index pages to scrape, including both the
# archive and the current CRAN Windows binary trees.
#
# Args:
#   url: Named list (or character vector) of directory-listing pages to read.
#        Each entry must end in "/" because the release directory name is
#        appended to it verbatim. Defaults to the CRAN archive and active
#        "bin/windows/contrib/" listings.
#
# Returns:
#   An unnamed character vector with one URL per release directory found
#   (for example ".../contrib/3.2/"); character(0) when none are found.
extract_url <- function(url = list(
                          archive = "https://cran-archive.r-project.org/bin/windows/contrib/",
                          active = "https://cran.r-project.org/bin/windows/contrib/"
                        )) {
  # Link text of a release directory, e.g. ">3.2/" captures "3.2/". The same
  # pattern is used both to select lines and to extract the version, so a
  # line can never be selected and then fail extraction (which would paste
  # the whole raw line onto the URL).
  ptn <- ".*?>(\\d+\\.\\d+/).*"

  get_urls <- function(url) {
    txt <- readLines(url)
    hits <- grep(ptn, txt, value = TRUE)
    # paste0() recycles a zero-length argument to "", so without this guard
    # a page with no release directories would yield the base URL itself.
    if (length(hits) == 0) {
      return(character(0))
    }
    versions <- sub(ptn, "\\1", hits)
    paste0(url, versions)
  }

  z <- lapply(url, get_urls)
  unname(unlist(z))
}
# Given the URL of one release's directory listing, extract the number of
# packages and the date of the most recent package upload.
#
# Args:
#   url: URL (or local file path) of a directory-listing page. Its basename
#        is reported as the R version.
#
# Returns:
#   A one-row data.frame with columns:
#     version - basename(url)
#     date    - latest "dd-Mon-yyyy" date found on a ".zip" line (NA when no
#               date could be parsed)
#     pkgs    - number of lines that mention a ".zip" file
extract_pkg_info <- function(url) {
  zip_ptn <- "\\.zip"

  # Reduce the dates found on the package lines of `txt` with `fun`.
  extract_date <- function(txt, fun = max) {
    # Drop the STATUS / PACKAGES index files. The previous pattern was a
    # bracket expression that matched any single one of those letters, which
    # discarded lines containing e.g. "Apr", "Aug", "Sep" or a "K" file size.
    # grepl() with negation is used because txt[-integer(0)] drops all lines.
    txt <- txt[!grepl("STATUS|PACKAGES", txt)]
    txt <- txt[grepl(zip_ptn, txt)]

    # A date that directly follows a tag, e.g. '<td align="right">16-Aug-2015'.
    ptn <- ".*?>(\\d{2}-...-\\d{4}).*"
    stamps <- sub(ptn, "\\1", txt[grepl(ptn, txt)])

    # NOTE(review): "%b" is parsed using the current LC_TIME locale; month
    # names on the page are English, so a non-English locale yields NA here.
    date <- as.Date(stamps, format = "%d-%b-%Y")
    date <- date[!is.na(date)]
    if (length(date) == 0) {
      # Avoid max() on an empty vector (warning and an infinite date).
      return(as.Date(NA))
    }
    match.fun(fun)(date)
  }

  message(url)
  txt <- readLines(url)
  count <- sum(grepl(zip_ptn, txt))
  data.frame(
    version = basename(url),
    date = extract_date(txt),
    pkgs = count
  )
}
# Get the list of CRAN URLs (one per R release directory)
CRAN_urls <- extract_url()
CRAN_urls
# Extract package information: one data-frame row per release, stacked
pkgs <- lapply(CRAN_urls, extract_pkg_info)
pkgs <- do.call(rbind, pkgs)
head(pkgs)
tail(pkgs)
# NOTE(review): this drops the last two rows by position; it assumes the two
# newest directories are the unreleased ones - confirm before re-running.
pkgs <- head(pkgs, -2) # Remove r-devel and r-future
# Extract major release information: versions ending in ".0". The pattern is
# anchored so that only a trailing ".0" qualifies.
major_releases <- pkgs[grepl("\\.0$", pkgs$version), ]
#
library(ggplot2)

# Package count per release over time: a smoother, the raw points and a rug,
# with each major release marked by a vertical line and a rotated label.
p <- ggplot(data = pkgs, mapping = aes(x = date, y = pkgs)) +
  geom_smooth() +
  geom_point() +
  geom_rug(colour = "grey50") +
  geom_vline(
    mapping = aes(xintercept = as.numeric(date)),
    data = major_releases,
    colour = "grey80"
  ) +
  geom_text(
    mapping = aes(label = paste("Version", version), y = 8000),
    data = major_releases,
    colour = "red",
    angle = 90,
    hjust = 1,
    vjust = -1
  ) +
  theme_minimal(base_size = 16) +
  labs(
    title = "Number of CRAN packages per R version",
    x = NULL,
    y = NULL
  )
print(p)
version date pkgs
1.7 2004-10-01 235
1.8 2004-10-01 290
1.9 2004-12-11 390
2.0 2005-03-07 464
2.1 2005-11-09 586
2.2 2006-05-30 701
2.3 2006-10-05 820
2.4 2007-06-26 1008
2.5 2007-11-23 1169
2.6 2008-06-25 1383
2.7 2008-12-14 1556
2.8 2009-06-25 1757
2.9 2010-01-28 2067
2.10 2010-10-08 2463
2.11 2011-03-31 2836
2.12 2011-10-30 3250
2.13 2012-02-28 3491
2.14 2013-03-31 4184
2.15 2014-03-31 5096
2.16 2015-03-16 6281
3.0 2015-03-16 6281
3.1 2016-03-31 7900
3.2 2016-03-31 8125
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment