Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Last active June 29, 2019 02:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save benmarwick/8445158417ed4b6af219675615a97869 to your computer and use it in GitHub Desktop.
Save benmarwick/8445158417ed4b6af219675615a97869 to your computer and use it in GitHub Desktop.
CRAN pkgs and tests
library(tidyverse)
library(gh)
.token = "" # get one from https://github.com/settings/tokens
# get all pkgs on GitHub mirror of CRAN
n <- 5000 # 16125 # https://github.com/cran
cran_repos <- gh("/users/:username/repos",
username = "cran",
.limit = n,
.token = .token )
# extract pkg names
cran_pkg_names <- vapply(cran_repos, "[[", "", "name")
cran_pkg_created <- vapply(cran_repos, "[[", "", "created_at")
# how many do we have?
length(cran_pkg_names)
# get tree for each pkg
gh_safely <- safely(gh)
pkg_trees <-
map(cran_pkg_names,
~gh_safely("GET /repos/:username/:repo/git/trees/master?recursive=1",
username = "cran",
repo = .x,
.token = .token))
saveRDS(pkg_trees, "pkg_trees.rds")
saveRDS(cran_repos, "cran_repos.rds")
saveRDS(cran_pkg_names, "cran_pkg_names.rds")
rm(pkg_trees)
rm(cran_repos)
# how many pkgs have tests?
# how does test presence/absense/size relate to...
# pkg age/size/contributors/deps
# where do people put tests? toplevel tests/ or inst/ or what?
# what frameworks do they use?
# what testing fns do they use? expect_? are they just testing for correct class? budget!
# what is the frequency of testing taxa?
# related reading
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5500893/
# https://yihui.name/en/2013/09/testing-r-packages/
# http://www.win-vector.com/blog/2019/03/unit-tests-in-r/
# RUnit (released June 2004) itself collects test suites from directories and then runs them, recording user assertions in a JUnit-inspired report. The idea is that once you have a bunch of tests you really want to track them some way.
# testthat (released November 2009) self-describes as integrating into a workflow. It runs tests found in the tests/testthat sub-directory (directory found relative to the package source, not relative to an installed package) and tracks user assertions. The related devtools/usethis package both writes a canonical test controlling file into the tests directory (allowing testthat to be triggered by "R CMD check"), and can also directly run tests.
# unitizer (released April 2017) bases its tests on comparisons of objects, rather than comparing text or requiring user assertions. It also aids in producing and updating reference objects.
# tinytest (pre-release)
# tests can be in test/ or inst/
# https://www.mango-solutions.com/blog/analyzing-coverage-of-r-unit-tests-in-packages-the-testcoverage-package
# analysis using revdeps, "These numbers equate to around 8% of R packages on CRAN containing any kind of recognised test framework."
# http://r-pkgs.had.co.nz/tests.html
# https://journal.r-project.org/archive/2012/RJ-2012-018/RJ-2012-018.pdf
library(tidyverse)
pkg_trees <- readRDS("pkg_trees.rds")
cran_pkg_names <- readRDS("cran_pkg_names.rds")
# drop the errors
good_ones <- transpose(pkg_trees)$result
# how many?
length(good_ones)
# get list of dataframes of files for each pkg
x <- map(good_ones, 3, 'tree') %>%
map(~map_df(.x, ~.x)) %>%
set_names(cran_pkg_names)
# keep only the files with 'test' in them
x_tests_dir <-
map(x, ~.x %>% filter(str_detect(path, "tests/"))) %>%
discard( ~ nrow(.x) == 0)
# how many have tests?
n_pkgs_with_tests <- length(x_tests_dir)
# what proportion of our sample?
n <- length(good_ones)
prop_with_tests <- n_pkgs_with_tests / n
perc_with_tests <- prop_with_tests * 100
# rough size of tests per pkg
test_file_sizes <-
x_tests_dir %>%
map(~.x %>% filter(str_detect( path , "\\.R$|\\.r$")))
size_of_tests_per_pkg <-
map_df(test_file_sizes, ~sum(.x$size, na.rm = TRUE)) %>%
gather(cran_pkg_names, test_size) %>%
left_join(
tibble(cran_pkg_names = cran_pkg_names,
pkg_size = map_int(x, ~sum(.x$size, na.rm = TRUE)))) %>%
mutate(test_size_ratio = test_size / pkg_size)
saveRDS(size_of_tests_per_pkg, "size_of_tests_per_pkg.rds")
size_of_tests_per_pkg_hist_plot <-
ggplot(size_of_tests_per_pkg,
aes(test_size_ratio)) +
geom_histogram() +
labs(x = "Ratio of test files size to total pkg size",
y = "Count") +
ggtitle("Tests in R pkgs on CRAN: How much of the pkg are tests?",
subtitle = str_glue('{n_pkgs_with_tests} pkgs that have tests, out of a sample of {n} pkg ({perc_with_tests}%)')) +
theme_minimal(base_size = 14)
# pkg metadata from CRAN ------------------------------------
# https://rviews.rstudio.com/2018/03/08/cran-package-metadata/
# https://www.rstudio.com/resources/videos/what-makes-a-great-r-package/
pkg_DESC <- tools:::CRAN_package_db()
meta_data <- pkg_DESC[, c(1,4,5,17,37,60,61)]
names(meta_data) <- c("Package","Dep","Imp","Aut","Date","RD","RI")
library(stringr)
# A helper function to unlist and string split.
fcn <- function(x,y){
x <- unlist(x) %>% strsplit(",")
y <- unlist(y) %>% strsplit(",")
z <- unlist(na.omit(union(x, y)))
}
meta_data <- mutate(meta_data,
DepImp = mapply(fcn,Dep,Imp),
RDRI = mapply(fcn,RD,RI))
# CLEAN THE AUTHOR'S FIELD
# Function to remove all text between two brackets
# http://bit.ly/2mE7TNJ
clean <- function(x){
gsub("\\[[^]]*]", "",x)
}
# Function to remove line breaks
# http://bit.ly/2B0n4VS
clean2 <- function(x){
gsub("[\r\n]", "", x)
}
# Clean Author's field
meta_data$Aut <- meta_data$Aut %>% map(clean) %>% map(clean2)
rm_na <- function(x){
list(na.omit(unlist(x)))
}
# Process the fields Aut, Dep, Imp, RD, RI
c_dat1 <- seq_len(nrow(meta_data)) %>%
map_df(~{
meta_data[.x, ] %>%
select(-Package,-DepImp,-RDRI) %>%
map_df(~ifelse(is.na(.x), 0, length(str_split(.x, ",")[[1]]))) %>%
mutate(Package = meta_data$Package[.x])
}) %>%
select(Package, Aut, Dep, Imp, RD, RI)
# head(c_dat1)
# Process the fields DepImp RDRI
c_dat2 <- seq_len(nrow(meta_data)) %>%
map_df(~{
meta_data[.x, ] %>%
select(-Package, -Aut, -Dep, -Imp, -RD, -RI, -Date) %>%
map_df(~ifelse(is.na(.x), 0, length(rm_na(.x)[[1]])))
}) %>%
select(DepImp, RDRI)
# head(c_dat2)
c_dat <- bind_cols(c_dat1, c_dat2, date = meta_data$Date)
head(c_dat)
saveRDS(c_dat, "c_dat.rds")
ss <- function(x){
avg <- round(mean(x),digits=2)
std <- round(sd(x),digits=2)
med <- median(x)
res <- list(mean = avg, sd = std, median = med)
}
res <- cbind(names(c_dat[,-c(1,9)]), map_df(c_dat[,-c(1,9)],ss))
names(res) <- c("Features", "mean", "sd", "median")
res
quantile(c_dat$RDRI)
library(tidyverse)
library(lubridate)
size_of_tests_per_pkg <- readRDS("size_of_tests_per_pkg.rds")
c_dat <- readRDS("c_dat.rds")
pkg_test_size_desc <-
left_join(size_of_tests_per_pkg,
c_dat, by = c("cran_pkg_names" = "Package")) %>%
mutate(date_fmt = ymd(date)) %>%
mutate(year = round_date(date_fmt, unit = "year"))
saveRDS(pkg_test_size_desc, "pkg_test_size_desc.rds")
# test ratio and Aut/RD/Date/pkg_size
test_ratio_vs_various_facet_plot <-
pkg_test_size_desc %>%
select(test_size_ratio,
Aut,
RDRI,
pkg_size) %>%
gather(variable,
value,
-test_size_ratio) %>%
ggplot(aes(test_size_ratio,
value)) +
geom_point() +
scale_y_log10() +
facet_wrap( ~ variable,
scales = "free") +
theme_minimal(base_size = 14)
library(cowplot)
plot_grid(size_of_tests_per_pkg_hist_plot,
test_ratio_vs_various_facet_plot,
ncol = 1)
# get pkg dates from here
library(rvest)
library(tidyverse)
library(lubridate)
library(ggbeeswarm)
# we need to look at the CRAN archive page for every pkg to get the first date
cran_pkg_names <- readRDS("cran_pkg_names.rds")
archive_url <- "https://cran.r-project.org/src/contrib/Archive/"
archive_urls <- map_chr(cran_pkg_names, ~str_glue('{archive_url}{.x}'))
get_archive_tables_safely <-
safely(function(x) read_html(x) %>%
html_nodes("table") %>%
html_table() %>%
.[[1]])
archive_tbls_lst <-
map(archive_urls[1:1000], ~get_archive_tables_safely(.x))
archive_tbls_lst_result <-
transpose(archive_tbls_lst)$result
#-------------------------------
the_url <- "https://cran.r-project.org/web/packages/available_packages_by_date.html"
cran_dates <- read_html(the_url)
cran_dates_tbl <-
cran_dates %>%
html_nodes("table") %>%
html_table()
cran_dates_tbl <- cran_dates_tbl[[1]]
saveRDS(cran_dates_tbl, "cran_dates_tbl.rds")
cran_dates_tbl <- readRDS("cran_dates_tbl.rds")
pkg_test_size_desc <- readRDS("pkg_test_size_desc.rds")
pkg_test_size_desc_birth_dates <-
left_join(cran_dates_tbl,
pkg_test_size_desc,
by = c("Package" = "cran_pkg_names")) %>%
mutate(birth_date = ymd(Date)) %>%
mutate(birth_year = round_date(birth_date, unit = "year")) %>%
mutate(birth_year = str_trunc(as.character(birth_year), 4, side = "right", ellipsis = ""))
ggplot(pkg_test_size_desc_birth_dates,
aes(birth_year,
test_size_ratio)) +
geom_boxplot() +
geom_quasirandom(alpha = 0.2) +
labs(y = "Ratio of test file size to pkg size",
x = "Year of pkg origin") +
scale_y_log10() +
theme_minimal(base_size = 14)
summary(pkg_test_size_desc_birth_dates)
# download all github.com/cran repos
library(tidyverse)
library(git2r)
library(gh)
.token = "" # get one from https://github.com/settings/tokens
# get all pkgs on GitHub mirror of CRAN
n <- 16125 # 16125 # https://github.com/cran
cran_repos <- gh("/users/:username/repos",
username = "cran",
.limit = n,
.token = .token )
# extract pkg names
cran_pkg_names <- vapply(cran_repos, "[[", "", "name")
# download source code of each pkg
map(cran_pkg_names[14786:n], ~clone(str_glue("https://github.com/cran/{.x}"),
local_path = .x))
# error checking after a few false startss
match("TeXCheckR", cran_pkg_names )
library(ctv)
library(tidyverse)
library(pak)
library(lubridate)
library(ggridges)
conflicted::conflict_prefer("filter", "dplyr")
# get view metadata
tv <- available.views() # this takes a while
# package list
dat <- tv %>%
transpose()
# tag with taskview identifier and bind in one df
taskview_pkg_id <- dat %>%
as_tibble() %>%
mutate(pkg = map2(name, packagelist, .f = function(x, y){
y %>%
mutate(taskview = x,
no_pkgs_in_tv = nrow(y))
})) %>%
pluck("pkg") %>%
bind_rows() %>%
rename(taskview_core = core)
# for each pkg in ctv, download the
# src from github.com/cran ...
# first apt-get install a bunch of things...
library(git2r)
library(gh)
safe_clone <- safely(git2r::clone)
safe_local_install <- safely(pak::pkg_install)
safe_covr_check <- safely(covr::package_coverage)
safe_remove <- safely(pak::pkg_remove)
n <- length(taskview_pkg_id$name)
covr_results <- vector("list", length = n )
for(i in 1:n){
# clone from github
message(paste0("Now we are cloning ", taskview_pkg_id$name[i]))
safe_clone(str_glue("https://github.com/cran/{taskview_pkg_id$name[i]}"),
local_path = taskview_pkg_id$name[i])
# install from source, need to do this to get deps
message(paste0("Now we are installing ", taskview_pkg_id$name[i]))
safe_local_install(taskview_pkg_id$name[i],
ask = FALSE)
message(paste0("Now we are running covr on ", taskview_pkg_id$name[i]))
covr_results[[i]] <-
safe_covr_check(taskview_pkg_id$name[i])
safe_remove(taskview_pkg_id$name[i])
unlink(taskview_pkg_id$name[i], recursive = TRUE )
unlink( str_subset (list.files(path = "/usr/local/lib/R/site-library//",
full.names = TRUE),
"00LOCK"),
force = TRUE,
recursive = TRUE)
message(paste0("Now we have completed ", i, " of ", n, " packages"))
gc()
}
names(covr_results) <- taskview_pkg_id$name
covr_results_tr <-
purrr::transpose(covr_results)$result
saveRDS(covr_results_tr, "covr_results_tr.rds")
sudo apt install docker.io
sudo docker run -e PASSWORD=ben -e ROOT=TRUE --rm -p 8787:8787 -m=6G --memory-swap=-1 rocker/verse
sudo apt-get update
sudo apt-get install jags libgsl-dev gtk2.0 xvfb libcairo2-dev xorg openbox xauth -y --no-install-recommends libx11-6 libxss1 libxt6 libxext6 libsm6 libice6 xdg-utils r-cran-cairodevice r-cran-rgtk2 libatk1.0-dev libpango1.0-dev libgtk2.0-dev libglib2.0-dev libcurl4-openssl-dev libgdal-dev libproj-dev
http://localhost:8787/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment