Last active
June 29, 2019 02:06
-
-
Save benmarwick/8445158417ed4b6af219675615a97869 to your computer and use it in GitHub Desktop.
CRAN pkgs and tests
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(gh) | |
.token = "" # get one from https://github.com/settings/tokens | |
# get all pkgs on GitHub mirror of CRAN | |
n <- 5000 # 16125 # https://github.com/cran | |
cran_repos <- gh("/users/:username/repos", | |
username = "cran", | |
.limit = n, | |
.token = .token ) | |
# extract pkg names | |
cran_pkg_names <- vapply(cran_repos, "[[", "", "name") | |
cran_pkg_created <- vapply(cran_repos, "[[", "", "created_at") | |
# how many do we have? | |
length(cran_pkg_names) | |
# get tree for each pkg | |
gh_safely <- safely(gh) | |
pkg_trees <- | |
map(cran_pkg_names, | |
~gh_safely("GET /repos/:username/:repo/git/trees/master?recursive=1", | |
username = "cran", | |
repo = .x, | |
.token = .token)) | |
saveRDS(pkg_trees, "pkg_trees.rds") | |
saveRDS(cran_repos, "cran_repos.rds") | |
saveRDS(cran_pkg_names, "cran_pkg_names.rds") | |
rm(pkg_trees) | |
rm(cran_repos) | |
# how many pkgs have tests? | |
# how does test presence/absense/size relate to... | |
# pkg age/size/contributors/deps | |
# where do people put tests? toplevel tests/ or inst/ or what? | |
# what frameworks do they use? | |
# what testing fns do they use? expect_? are they just testing for correct class? budget! | |
# what is the frequency of testing taxa? | |
# related reading | |
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5500893/ | |
# https://yihui.name/en/2013/09/testing-r-packages/ | |
# http://www.win-vector.com/blog/2019/03/unit-tests-in-r/ | |
# RUnit (released June 2004) itself collects test suites from directories and then runs them, recording user assertions in a JUnit-inspired report. The idea is that once you have a bunch of tests you really want to track them some way. | |
# testthat (released November 2009) self-describes as integrating into a workflow. It runs tests found in the tests/testthat sub-directory (directory found relative to the package source, not relative to an installed package) and tracks user assertions. The related devtools/usethis package both writes a canonical test controlling file into the tests directory (allowing testthat to be triggered by "R CMD check"), and can also directly run tests. | |
# unitizer (released April 2017) bases its tests on comparisons of objects, rather than comparing text or requiring user assertions. It also aids in producing and updating reference objects. | |
# tinytest (pre-release) | |
# tests can be in test/ or inst/ | |
# https://www.mango-solutions.com/blog/analyzing-coverage-of-r-unit-tests-in-packages-the-testcoverage-package | |
# analysis using revdeps, "These numbers equate to around 8% of R packages on CRAN containing any kind of recognised test framework." | |
# http://r-pkgs.had.co.nz/tests.html | |
# https://journal.r-project.org/archive/2012/RJ-2012-018/RJ-2012-018.pdf | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
pkg_trees <- readRDS("pkg_trees.rds") | |
cran_pkg_names <- readRDS("cran_pkg_names.rds") | |
# drop the errors | |
good_ones <- transpose(pkg_trees)$result | |
# how many? | |
length(good_ones) | |
# get list of dataframes of files for each pkg | |
x <- map(good_ones, 3, 'tree') %>% | |
map(~map_df(.x, ~.x)) %>% | |
set_names(cran_pkg_names) | |
# keep only the files with 'test' in them | |
x_tests_dir <- | |
map(x, ~.x %>% filter(str_detect(path, "tests/"))) %>% | |
discard( ~ nrow(.x) == 0) | |
# how many have tests? | |
n_pkgs_with_tests <- length(x_tests_dir) | |
# what proportion of our sample? | |
n <- length(good_ones) | |
prop_with_tests <- n_pkgs_with_tests / n | |
perc_with_tests <- prop_with_tests * 100 | |
# rough size of tests per pkg | |
test_file_sizes <- | |
x_tests_dir %>% | |
map(~.x %>% filter(str_detect( path , "\\.R$|\\.r$"))) | |
size_of_tests_per_pkg <- | |
map_df(test_file_sizes, ~sum(.x$size, na.rm = TRUE)) %>% | |
gather(cran_pkg_names, test_size) %>% | |
left_join( | |
tibble(cran_pkg_names = cran_pkg_names, | |
pkg_size = map_int(x, ~sum(.x$size, na.rm = TRUE)))) %>% | |
mutate(test_size_ratio = test_size / pkg_size) | |
saveRDS(size_of_tests_per_pkg, "size_of_tests_per_pkg.rds") | |
size_of_tests_per_pkg_hist_plot <- | |
ggplot(size_of_tests_per_pkg, | |
aes(test_size_ratio)) + | |
geom_histogram() + | |
labs(x = "Ratio of test files size to total pkg size", | |
y = "Count") + | |
ggtitle("Tests in R pkgs on CRAN: How much of the pkg are tests?", | |
subtitle = str_glue('{n_pkgs_with_tests} pkgs that have tests, out of a sample of {n} pkg ({perc_with_tests}%)')) + | |
theme_minimal(base_size = 14) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pkg metadata from CRAN ------------------------------------ | |
# https://rviews.rstudio.com/2018/03/08/cran-package-metadata/ | |
# https://www.rstudio.com/resources/videos/what-makes-a-great-r-package/ | |
pkg_DESC <- tools:::CRAN_package_db() | |
meta_data <- pkg_DESC[, c(1,4,5,17,37,60,61)] | |
names(meta_data) <- c("Package","Dep","Imp","Aut","Date","RD","RI") | |
library(stringr) | |
# A helper function to unlist and string split. | |
fcn <- function(x,y){ | |
x <- unlist(x) %>% strsplit(",") | |
y <- unlist(y) %>% strsplit(",") | |
z <- unlist(na.omit(union(x, y))) | |
} | |
meta_data <- mutate(meta_data, | |
DepImp = mapply(fcn,Dep,Imp), | |
RDRI = mapply(fcn,RD,RI)) | |
# CLEAN THE AUTHOR'S FIELD | |
# Function to remove all text between two brackets | |
# http://bit.ly/2mE7TNJ | |
clean <- function(x){ | |
gsub("\\[[^]]*]", "",x) | |
} | |
# Function to remove line breaks | |
# http://bit.ly/2B0n4VS | |
clean2 <- function(x){ | |
gsub("[\r\n]", "", x) | |
} | |
# Clean Author's field | |
meta_data$Aut <- meta_data$Aut %>% map(clean) %>% map(clean2) | |
rm_na <- function(x){ | |
list(na.omit(unlist(x))) | |
} | |
# Process the fields Aut, Dep, Imp, RD, RI | |
c_dat1 <- seq_len(nrow(meta_data)) %>% | |
map_df(~{ | |
meta_data[.x, ] %>% | |
select(-Package,-DepImp,-RDRI) %>% | |
map_df(~ifelse(is.na(.x), 0, length(str_split(.x, ",")[[1]]))) %>% | |
mutate(Package = meta_data$Package[.x]) | |
}) %>% | |
select(Package, Aut, Dep, Imp, RD, RI) | |
# head(c_dat1) | |
# Process the fields DepImp RDRI | |
c_dat2 <- seq_len(nrow(meta_data)) %>% | |
map_df(~{ | |
meta_data[.x, ] %>% | |
select(-Package, -Aut, -Dep, -Imp, -RD, -RI, -Date) %>% | |
map_df(~ifelse(is.na(.x), 0, length(rm_na(.x)[[1]]))) | |
}) %>% | |
select(DepImp, RDRI) | |
# head(c_dat2) | |
c_dat <- bind_cols(c_dat1, c_dat2, date = meta_data$Date) | |
head(c_dat) | |
saveRDS(c_dat, "c_dat.rds") | |
ss <- function(x){ | |
avg <- round(mean(x),digits=2) | |
std <- round(sd(x),digits=2) | |
med <- median(x) | |
res <- list(mean = avg, sd = std, median = med) | |
} | |
res <- cbind(names(c_dat[,-c(1,9)]), map_df(c_dat[,-c(1,9)],ss)) | |
names(res) <- c("Features", "mean", "sd", "median") | |
res | |
quantile(c_dat$RDRI) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(lubridate) | |
size_of_tests_per_pkg <- readRDS("size_of_tests_per_pkg.rds") | |
c_dat <- readRDS("c_dat.rds") | |
pkg_test_size_desc <- | |
left_join(size_of_tests_per_pkg, | |
c_dat, by = c("cran_pkg_names" = "Package")) %>% | |
mutate(date_fmt = ymd(date)) %>% | |
mutate(year = round_date(date_fmt, unit = "year")) | |
saveRDS(pkg_test_size_desc, "pkg_test_size_desc.rds") | |
# test ratio and Aut/RD/Date/pkg_size | |
test_ratio_vs_various_facet_plot <- | |
pkg_test_size_desc %>% | |
select(test_size_ratio, | |
Aut, | |
RDRI, | |
pkg_size) %>% | |
gather(variable, | |
value, | |
-test_size_ratio) %>% | |
ggplot(aes(test_size_ratio, | |
value)) + | |
geom_point() + | |
scale_y_log10() + | |
facet_wrap( ~ variable, | |
scales = "free") + | |
theme_minimal(base_size = 14) | |
library(cowplot) | |
plot_grid(size_of_tests_per_pkg_hist_plot, | |
test_ratio_vs_various_facet_plot, | |
ncol = 1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get pkg dates from here | |
library(rvest) | |
library(tidyverse) | |
library(lubridate) | |
library(ggbeeswarm) | |
# we need to look at the CRAN archive page for every pkg to get the first date | |
cran_pkg_names <- readRDS("cran_pkg_names.rds") | |
archive_url <- "https://cran.r-project.org/src/contrib/Archive/" | |
archive_urls <- map_chr(cran_pkg_names, ~str_glue('{archive_url}{.x}')) | |
get_archive_tables_safely <- | |
safely(function(x) read_html(x) %>% | |
html_nodes("table") %>% | |
html_table() %>% | |
.[[1]]) | |
archive_tbls_lst <- | |
map(archive_urls[1:1000], ~get_archive_tables_safely(.x)) | |
archive_tbls_lst_result <- | |
transpose(archive_tbls_lst)$result | |
#------------------------------- | |
the_url <- "https://cran.r-project.org/web/packages/available_packages_by_date.html" | |
cran_dates <- read_html(the_url) | |
cran_dates_tbl <- | |
cran_dates %>% | |
html_nodes("table") %>% | |
html_table() | |
cran_dates_tbl <- cran_dates_tbl[[1]] | |
saveRDS(cran_dates_tbl, "cran_dates_tbl.rds") | |
cran_dates_tbl <- readRDS("cran_dates_tbl.rds") | |
pkg_test_size_desc <- readRDS("pkg_test_size_desc.rds") | |
pkg_test_size_desc_birth_dates <- | |
left_join(cran_dates_tbl, | |
pkg_test_size_desc, | |
by = c("Package" = "cran_pkg_names")) %>% | |
mutate(birth_date = ymd(Date)) %>% | |
mutate(birth_year = round_date(birth_date, unit = "year")) %>% | |
mutate(birth_year = str_trunc(as.character(birth_year), 4, side = "right", ellipsis = "")) | |
ggplot(pkg_test_size_desc_birth_dates, | |
aes(birth_year, | |
test_size_ratio)) + | |
geom_boxplot() + | |
geom_quasirandom(alpha = 0.2) + | |
labs(y = "Ratio of test file size to pkg size", | |
x = "Year of pkg origin") + | |
scale_y_log10() + | |
theme_minimal(base_size = 14) | |
summary(pkg_test_size_desc_birth_dates) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# download all github.com/cran repos | |
library(tidyverse) | |
library(git2r) | |
library(gh) | |
.token = "" # get one from https://github.com/settings/tokens | |
# get all pkgs on GitHub mirror of CRAN | |
n <- 16125 # 16125 # https://github.com/cran | |
cran_repos <- gh("/users/:username/repos", | |
username = "cran", | |
.limit = n, | |
.token = .token ) | |
# extract pkg names | |
cran_pkg_names <- vapply(cran_repos, "[[", "", "name") | |
# download source code of each pkg | |
map(cran_pkg_names[14786:n], ~clone(str_glue("https://github.com/cran/{.x}"), | |
local_path = .x)) | |
# error checking after a few false startss | |
match("TeXCheckR", cran_pkg_names ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ctv) | |
library(tidyverse) | |
library(pak) | |
library(lubridate) | |
library(ggridges) | |
conflicted::conflict_prefer("filter", "dplyr") | |
# get view metadata | |
tv <- available.views() # this takes a while | |
# package list | |
dat <- tv %>% | |
transpose() | |
# tag with taskview identifier and bind in one df | |
taskview_pkg_id <- dat %>% | |
as_tibble() %>% | |
mutate(pkg = map2(name, packagelist, .f = function(x, y){ | |
y %>% | |
mutate(taskview = x, | |
no_pkgs_in_tv = nrow(y)) | |
})) %>% | |
pluck("pkg") %>% | |
bind_rows() %>% | |
rename(taskview_core = core) | |
# for each pkg in ctv, download the | |
# src from github.com/cran ... | |
# first apt-get install a bunch of things... | |
library(git2r) | |
library(gh) | |
safe_clone <- safely(git2r::clone) | |
safe_local_install <- safely(pak::pkg_install) | |
safe_covr_check <- safely(covr::package_coverage) | |
safe_remove <- safely(pak::pkg_remove) | |
n <- length(taskview_pkg_id$name) | |
covr_results <- vector("list", length = n ) | |
for(i in 1:n){ | |
# clone from github | |
message(paste0("Now we are cloning ", taskview_pkg_id$name[i])) | |
safe_clone(str_glue("https://github.com/cran/{taskview_pkg_id$name[i]}"), | |
local_path = taskview_pkg_id$name[i]) | |
# install from source, need to do this to get deps | |
message(paste0("Now we are installing ", taskview_pkg_id$name[i])) | |
safe_local_install(taskview_pkg_id$name[i], | |
ask = FALSE) | |
message(paste0("Now we are running covr on ", taskview_pkg_id$name[i])) | |
covr_results[[i]] <- | |
safe_covr_check(taskview_pkg_id$name[i]) | |
safe_remove(taskview_pkg_id$name[i]) | |
unlink(taskview_pkg_id$name[i], recursive = TRUE ) | |
unlink( str_subset (list.files(path = "/usr/local/lib/R/site-library//", | |
full.names = TRUE), | |
"00LOCK"), | |
force = TRUE, | |
recursive = TRUE) | |
message(paste0("Now we have completed ", i, " of ", n, " packages")) | |
gc() | |
} | |
names(covr_results) <- taskview_pkg_id$name | |
covr_results_tr <- | |
purrr::transpose(covr_results)$result | |
saveRDS(covr_results_tr, "covr_results_tr.rds") | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sudo apt install docker.io | |
sudo docker run -e PASSWORD=ben -e ROOT=TRUE --rm -p 8787:8787 -m=6G --memory-swap=-1 rocker/verse | |
sudo apt-get update | |
sudo apt-get install jags libgsl-dev gtk2.0 xvfb libcairo2-dev xorg openbox xauth -y --no-install-recommends libx11-6 libxss1 libxt6 libxext6 libsm6 libice6 xdg-utils r-cran-cairodevice r-cran-rgtk2 libatk1.0-dev libpango1.0-dev libgtk2.0-dev libglib2.0-dev libcurl4-openssl-dev libgdal-dev libproj-dev | |
http://localhost:8787/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment