Last active
March 4, 2019 15:01
-
-
Save arraytools/f037a4fd90ae8fe8e2896db6fe5b7fad to your computer and use it in GitHub Desktop.
Get top downloaded R packages from a certain time period
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ====================================================================== | |
## Step 1: Download all log files | |
## ====================================================================== | |
# Here's an easy way to get all the URLs in R | |
# Date range to analyse (both ends inclusive) and the matching log URLs.
start <- as.Date("2018-07-01")
today <- as.Date("2018-12-31")
all_days <- seq(start, today, by = "day")
year <- as.POSIXlt(all_days)$year + 1900
urls <- paste0(
  "http://cran-logs.rstudio.com/", year, "/", all_days, ".csv.gz"
)

# Create the target directory up front; showWarnings = FALSE keeps re-runs
# quiet when the directory already exists.
dir.create("CRANlogs", showWarnings = FALSE)

# only download the files you don't have:
# file_path_sans_ext(x, TRUE) strips ".gz" and then ".csv", leaving the date.
missing_days <- setdiff(
  as.character(all_days),
  tools::file_path_sans_ext(dir("CRANlogs"), TRUE)
)

# Pick the URL that belongs to each missing day. Indexing `urls` with the
# loop counter would pair missing day i with the URL of calendar day i, which
# is only correct when no file has been downloaded yet.
missing_urls <- urls[match(missing_days, as.character(all_days))]

# seq_along() yields an empty sequence when nothing is missing, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(missing_days)) {
  print(paste0(i, "/", length(missing_days)))
  download.file(
    missing_urls[i],
    paste0("CRANlogs/", missing_days[i], ".csv.gz")
  )
}
# Example of the first 3 rows | |
# date time size r_version r_arch r_os package version country ip_id | |
# 1 2018-08-04 02:09:18 66135 3.4.4 x86_64 linux-gnu desc 1.2.0 HK 1 | |
# 2 2018-08-04 02:08:44 15772 3.5.1 x86_64 darwin15.6.0 bindr 0.1.1 CA 2 | |
# 3 2018-08-04 02:09:18 2703640 3.4.4 x86_64 linux-gnu igraph 1.2.2 HK 1 | |
## ====================================================================== | |
## Step 2: Load single data files into one big data.frame
## ====================================================================== | |
# Restrict the directory listing to the log files of the requested days, in
# date order. Matching on the exact expected file name (rather than grep)
# means a day without a file is skipped instead of breaking the subsetting.
file_list <- list.files("CRANlogs", full.names = TRUE)
wanted <- paste0(as.character(all_days), ".csv.gz")
absent <- !(wanted %in% basename(file_list))
if (any(absent)) {
  warning(sum(absent), " requested day(s) have no log file in CRANlogs",
          call. = FALSE)
}
file_list <- file_list[match(wanted, basename(file_list), nomatch = 0L)]
if (length(file_list) == 0) {
  stop("No log files found in CRANlogs for the requested days",
       call. = FALSE)
}

# This takes time too.
# It should be replaced by a parallel call
# Each day's columns of interest are collected in a preallocated list and
# bound once at the end; rbind() inside the loop re-copies the growing data
# frame on every iteration.
keep_cols <- c("date", "package", "version", "ip_id")
daily <- vector("list", length(file_list))
first_dim <- NULL
stime <- Sys.time()
for (i in seq_along(file_list)) {
  file <- file_list[i]
  print(paste("Reading", file, "..."))
  logs <- read.table(file, header = TRUE, sep = ",", quote = "\"",
                     dec = ".", fill = TRUE, comment.char = "",
                     as.is = TRUE)
  if (i == 1L) {
    # Remember the shape of the first full log for the sanity check below.
    first_dim <- dim(logs)
  }
  daily[[i]] <- logs[, keep_cols]
}
# rbind together all files
dat <- do.call(rbind, daily)
rm(daily)
# format() keeps the units of the difftime (secs / mins / hours).
cat(format(Sys.time() - stime), " is taken\n") # 37 minutes before adding dat object
length(unique(dat[, "date"]))
# [1] 184
# Dimensions of the first day's log as read (all columns).
first_dim
# [1] 743833 10
dim(dat)
# [1] 369172793 4
# The value is in MB; the "bytes" label printed after dividing an
# object_size is misleading.
object.size(dat) / 2^20 # 9859.2 (MB)
save(dat, file = "CRANlogs/CRANlogs.RData")
write.table(dat, file = "CRANlogs/CRANlogs.txt", sep = "\t",
            quote = FALSE, row.names = FALSE)
# for later analyses: load the saved data.frame
# load("CRANlogs/CRANlogs.RData") | |
## ====================================================================== | |
## Step 3: Show top packages and the download statistics for the
## packages of interest
## ====================================================================== | |
library(plyr)
str(dat)
# Count downloads (rows) per package. table() tallies the package column
# directly; splitting the whole data frame by package only to call nrow() on
# each piece copies every row for no benefit. The result is converted to a
# plain named integer vector, the same shape the split/sapply version gave.
pkg_counts <- table(dat$package)
ndownloads <- setNames(as.integer(pkg_counts), names(pkg_counts))
str(ndownloads)
# Named int [1:15535] 5104 56 3755 45379 3034 44966 2421 89 3331 2491 ...
# - attr(*, "names")= chr [1:15535] "A3" "aaMI" "abbyyR" "abc" ...
# The value is in KB; the "bytes" label printed after dividing an
# object_size is misleading.
object.size(ndownloads) / 2^10
# 974.7 (KB)
save(ndownloads, file = "CRANlogs/ndownloads.RData")
# Packages related to generating C-statistic for survival data
pkg <- c("survival", "survC1", "survAUC", "survcomp", "Hmisc", "pec")
ndownloads[pkg]
# survival survC1 survAUC survcomp Hmisc pec
# 634511 2802 10724 127 759760 18136
# Packages related to big data
pkg <- c("ff", "ffbase", "bigmemory", "biganalytics", "bigtabulate")
ndownloads[pkg]
# ff ffbase bigmemory biganalytics bigtabulate
# 86494 61002 66892 5761 2866
# Top 50 packages by download count. head() returns fewer rows when there
# are fewer than 50 packages, where [1:50] would pad the result with NA.
as.matrix(head(sort(ndownloads, decreasing = TRUE), 50))
# [,1] | |
# tidyverse 13633648 | |
# Rcpp 4693972 | |
# rlang 4131855 | |
# ggplot2 3915127 | |
# stringi 3612365 | |
# glue 3328441 | |
# digest 3275212 | |
# dplyr 3256666 | |
# stringr 3170106 | |
# pillar 3042329 | |
# tibble 2977546 | |
# fansi 2838908 | |
# utf8 2808983 | |
# cli 2799004 | |
# R6 2774427 | |
# crayon 2686846 | |
# yaml 2679545 | |
# scales 2586105 | |
# magrittr 2578818 | |
# assertthat 2528465 | |
# curl 2503656 | |
# reshape2 2422822 | |
# jsonlite 2421323 | |
# BH 2402118 | |
# plyr 2379406 | |
# withr 2333785 | |
# mime 2325869 | |
# pkgconfig 2290113 | |
# htmltools 2285338 | |
# knitr 2228660 | |
# purrr 2226828 | |
# munsell 2225851 | |
# data.table 2220833 | |
# lazyeval 2210901 | |
# tidyselect 2154770 | |
# evaluate 2109627 | |
# base64enc 2070405 | |
# RColorBrewer 2065863 | |
# colorspace 2036735 | |
# markdown 2000513 | |
# bindrcpp 1995371 | |
# viridisLite 1979898 | |
# openssl 1951288 | |
# plogr 1940196 | |
# httr 1913727 | |
# backports 1859015 | |
# gtable 1857385 | |
# highr 1832832 | |
# tidyr 1828262 | |
# readr 1827858 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment