Skip to content

Instantly share code, notes, and snippets.

@arraytools
Last active March 4, 2019 15:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arraytools/f037a4fd90ae8fe8e2896db6fe5b7fad to your computer and use it in GitHub Desktop.
Save arraytools/f037a4fd90ae8fe8e2896db6fe5b7fad to your computer and use it in GitHub Desktop.
Get top downloaded R packages from a certain time period
## ======================================================================
## Step 1: Download all log files
## ======================================================================
# Build one download URL per day in the period of interest.
start <- as.Date("2018-07-01")
today <- as.Date("2018-12-31")
all_days <- seq(start, today, by = "day")
year <- as.POSIXlt(all_days)$year + 1900
urls <- paste0("http://cran-logs.rstudio.com/", year, "/", all_days, ".csv.gz")
# Name each URL by its day so it can be looked up by date rather than by
# position: positions in `missing_days` do not line up with positions in
# `all_days` once some of the files have already been downloaded.
names(urls) <- as.character(all_days)

# Create the target directory before listing it; stay quiet if it already
# exists so that the script can be re-run.
dir.create("CRANlogs", showWarnings = FALSE)

# only download the files you don't have:
# (compression = TRUE strips ".gz" and then ".csv", leaving the bare date)
missing_days <- setdiff(
  as.character(all_days),
  tools::file_path_sans_ext(dir("CRANlogs"), compression = TRUE)
)

# seq_along() makes the loop a no-op when nothing is missing
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(missing_days)) {
  day <- missing_days[i]
  print(paste0(i, "/", length(missing_days)))
  download.file(urls[[day]], file.path("CRANlogs", paste0(day, ".csv.gz")))
}
# Example of the first 3 rows
# date time size r_version r_arch r_os package version country ip_id
# 1 2018-08-04 02:09:18 66135 3.4.4 x86_64 linux-gnu desc 1.2.0 HK 1
# 2 2018-08-04 02:08:44 15772 3.5.1 x86_64 darwin15.6.0 bindr 0.1.1 CA 2
# 3 2018-08-04 02:09:18 2703640 3.4.4 x86_64 linux-gnu igraph 1.2.2 HK 1
## ======================================================================
## Step 2: Load single data files into one big data.table
## ======================================================================
# Keep only the downloaded log files whose date falls inside `all_days`.
# Matching on the date part of the file name (instead of grep() per day)
# tolerates days whose file is absent and ignores other files kept in the
# same directory (e.g. CRANlogs.RData).
file_list <- list.files("CRANlogs", pattern = "\\.csv\\.gz$", full.names = TRUE)
file_days <- tools::file_path_sans_ext(basename(file_list), compression = TRUE)
file_list <- file_list[file_days %in% as.character(all_days)]

# This takes time too.
# It should be replaced by a parallel call
keep_cols <- c("date", "package", "version", "ip_id")
stime <- Sys.time()
# Read every file into a list first and bind once at the end; calling
# rbind() inside the loop re-copies the accumulated data on every iteration.
logs <- lapply(file_list, function(file) {
  print(paste("Reading", file, "..."))
  one_day <- read.table(file, header = TRUE, sep = ",", quote = "\"",
                        dec = ".", fill = TRUE, comment.char = "",
                        as.is = TRUE)
  # Only these columns are needed downstream; dropping the rest keeps
  # memory use down.
  one_day[, keep_cols]
})
# Dimensions of the first day's log (rows x kept columns)
dim(logs[[1]])
# [1] 743833 4
# rbind together all files
dat <- do.call(rbind, logs)
rm(logs) # release the per-file copies now that they are combined
# Report the elapsed time with an explicit unit (a bare difftime passed to
# cat() prints only the number).
elapsed <- difftime(Sys.time(), stime, units = "mins")
cat(format(elapsed), "is taken\n") # 37 minutes before adding dat object
length(unique(dat[, "date"]))
# [1] 184
dim(dat)
# [1] 369172793 4
object.size(dat) / 2^20 # size in MiB: 9859.2
save(dat, file = "CRANlogs/CRANlogs.RData")
write.table(dat, file = "CRANlogs/CRANlogs.txt", sep = "\t", quote = FALSE,
            row.names = FALSE)
# for later analyses: load the saved data frame
# load("CRANlogs/CRANlogs.RData")
## ======================================================================
## Step 3: Show top packages and the download statistics from interested
## packages
## ======================================================================
# NOTE(review): nothing below uses plyr (only base/stats functions are
# called); the library() call is kept in case later code relies on it.
library(plyr)
str(dat)
# Number of log rows (downloads) per package. table() counts directly,
# which avoids materialising one sub-data-frame per package via split().
pkg_counts <- table(dat$package)
# Convert to a plain named integer vector, sorted by package name.
ndownloads <- setNames(as.vector(pkg_counts), names(pkg_counts))
str(ndownloads)
# Named int [1:15535] 5104 56 3755 45379 3034 44966 2421 89 3331 2491 ...
# - attr(*, "names")= chr [1:15535] "A3" "aaMI" "abbyyR" "abc" ...
object.size(ndownloads) / 2^10 # size in KiB
# 974.7
save(ndownloads, file = "CRANlogs/ndownloads.RData")
# Packages related to generating C-statistic for survival data
pkg <- c("survival", "survC1", "survAUC", "survcomp", "Hmisc", "pec")
ndownloads[pkg]
# survival survC1 survAUC survcomp Hmisc pec
# 634511 2802 10724 127 759760 18136
# Packages related to big data
pkg <- c("ff", "ffbase", "bigmemory", "biganalytics", "bigtabulate")
ndownloads[pkg]
# ff ffbase bigmemory biganalytics bigtabulate
# 86494 61002 66892 5761 2866
# Top 50 packages by download count; head() does not pad with NA when
# fewer than 50 packages are present (unlike [1:50]).
as.matrix(head(sort(ndownloads, decreasing = TRUE), 50))
# [,1]
# tidyverse 13633648
# Rcpp 4693972
# rlang 4131855
# ggplot2 3915127
# stringi 3612365
# glue 3328441
# digest 3275212
# dplyr 3256666
# stringr 3170106
# pillar 3042329
# tibble 2977546
# fansi 2838908
# utf8 2808983
# cli 2799004
# R6 2774427
# crayon 2686846
# yaml 2679545
# scales 2586105
# magrittr 2578818
# assertthat 2528465
# curl 2503656
# reshape2 2422822
# jsonlite 2421323
# BH 2402118
# plyr 2379406
# withr 2333785
# mime 2325869
# pkgconfig 2290113
# htmltools 2285338
# knitr 2228660
# purrr 2226828
# munsell 2225851
# data.table 2220833
# lazyeval 2210901
# tidyselect 2154770
# evaluate 2109627
# base64enc 2070405
# RColorBrewer 2065863
# colorspace 2036735
# markdown 2000513
# bindrcpp 1995371
# viridisLite 1979898
# openssl 1951288
# plogr 1940196
# httr 1913727
# backports 1859015
# gtable 1857385
# highr 1832832
# tidyr 1828262
# readr 1827858
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment