benmarwick/001-get-pkg-data-from-cran-github-mirror.R

## 001-get-pkg-data-from-cran-github-mirror.R

library(tidyverse)
library(gh)

.token = ""  # get one from https://github.com/settings/tokens

# get all pkgs on GitHub mirror of CRAN
n <-  5000 # 16125 # https://github.com/cran
cran_repos <- gh("/users/:username/repos",
                 username = "cran",
                 .limit = n,
                 .token = .token )

# extract pkg names
cran_pkg_names <- vapply(cran_repos, "[[", "", "name")
cran_pkg_created <- vapply(cran_repos, "[[", "", "created_at")

# how many do we have?
length(cran_pkg_names)

# get tree for each pkg

gh_safely <- safely(gh)

pkg_trees <-
  map(cran_pkg_names,
      ~gh_safely("GET /repos/:username/:repo/git/trees/master?recursive=1",
                 username = "cran",
                 repo = .x,
                 .token = .token))

saveRDS(pkg_trees, "pkg_trees.rds")
saveRDS(cran_repos, "cran_repos.rds")
saveRDS(cran_pkg_names, "cran_pkg_names.rds")
rm(pkg_trees)
rm(cran_repos)


# how many pkgs have tests?
# how does test presence/absense/size relate to...
# pkg age/size/contributors/deps
# where do people put tests? toplevel tests/ or inst/ or what?
# what frameworks do they use?
# what testing fns do they use? expect_? are they just testing for correct class? budget!
# what is the frequency of testing taxa?

# related reading

# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5500893/
# https://yihui.name/en/2013/09/testing-r-packages/
# http://www.win-vector.com/blog/2019/03/unit-tests-in-r/
# RUnit (released June 2004) itself collects test suites from directories and then runs them, recording user assertions in a JUnit-inspired report. The idea is that once you have a bunch of tests you really want to track them some way.
# testthat (released November 2009) self-describes as integrating into a workflow. It runs tests found in the tests/testthat sub-directory (directory found relative to the package source, not relative to an installed package) and tracks user assertions. The related devtools/usethis package both writes a canonical test controlling file into the tests directory (allowing testthat to be triggered by "R CMD check"), and can also directly run tests.
# unitizer (released April 2017) bases its tests on comparisons of objects, rather than comparing text or requiring user assertions. It also aids in producing and updating reference objects.
# tinytest (pre-release)
# tests can be in test/ or inst/
# https://www.mango-solutions.com/blog/analyzing-coverage-of-r-unit-tests-in-packages-the-testcoverage-package
# analysis using revdeps, "These numbers equate to around 8% of R packages on CRAN containing any kind of recognised test framework."
# http://r-pkgs.had.co.nz/tests.html
# https://journal.r-project.org/archive/2012/RJ-2012-018/RJ-2012-018.pdf


## 002-pkg-data-into-tbls.R
library(tidyverse)
pkg_trees <- readRDS("pkg_trees.rds")
cran_pkg_names <- readRDS("cran_pkg_names.rds")

# drop the errors
good_ones <- transpose(pkg_trees)$result
# how many?
length(good_ones)


# get list of dataframes of files for each pkg
x <- map(good_ones, 3, 'tree') %>%
  map(~map_df(.x, ~.x)) %>%
  set_names(cran_pkg_names)

# keep only the files with 'test' in them
x_tests_dir <-
  map(x, ~.x %>% filter(str_detect(path, "tests/"))) %>%
  discard( ~ nrow(.x) == 0)

# how many have tests?
n_pkgs_with_tests <- length(x_tests_dir)

# what proportion of our sample?
n <- length(good_ones)
prop_with_tests <- n_pkgs_with_tests / n
perc_with_tests <-  prop_with_tests * 100

# rough size of tests per pkg
test_file_sizes <-
x_tests_dir %>%
  map(~.x %>% filter(str_detect( path , "\\.R$|\\.r$")))

size_of_tests_per_pkg <-
  map_df(test_file_sizes, ~sum(.x$size, na.rm = TRUE)) %>%
  gather(cran_pkg_names, test_size) %>%
  left_join(
    tibble(cran_pkg_names = cran_pkg_names,
           pkg_size = map_int(x, ~sum(.x$size, na.rm = TRUE)))) %>%
  mutate(test_size_ratio =  test_size / pkg_size)

saveRDS(size_of_tests_per_pkg, "size_of_tests_per_pkg.rds")

size_of_tests_per_pkg_hist_plot <-
  ggplot(size_of_tests_per_pkg,
         aes(test_size_ratio)) +
  geom_histogram() +
  labs(x = "Ratio of test files size to total pkg size",
       y = "Count") +
  ggtitle("Tests in R pkgs on CRAN: How much of the pkg are tests?",
          subtitle = str_glue('{n_pkgs_with_tests} pkgs that have tests, out of a sample of {n} pkg ({perc_with_tests}%)')) +
  theme_minimal(base_size = 14)


## 003-pkg-metadata-from-CRAN.R

# pkg metadata from CRAN ------------------------------------
# https://rviews.rstudio.com/2018/03/08/cran-package-metadata/
# https://www.rstudio.com/resources/videos/what-makes-a-great-r-package/

pkg_DESC <- tools:::CRAN_package_db()
meta_data <- pkg_DESC[, c(1,4,5,17,37,60,61)]
names(meta_data) <- c("Package","Dep","Imp","Aut","Date","RD","RI")
library(stringr)
# A helper function to unlist and string split.
fcn <- function(x,y){
  x <- unlist(x) %>% strsplit(",")
  y <- unlist(y) %>% strsplit(",")
  z <- unlist(na.omit(union(x, y)))
}

meta_data <- mutate(meta_data,
                    DepImp = mapply(fcn,Dep,Imp),
                    RDRI = mapply(fcn,RD,RI))

# CLEAN THE AUTHOR'S FIELD
# Function to remove all text between two brackets
# http://bit.ly/2mE7TNJ
clean <- function(x){
  gsub("\\[[^]]*]", "",x)
}

# Function to remove line breaks
# http://bit.ly/2B0n4VS

clean2 <- function(x){
  gsub("[\r\n]", "", x)
}

# Clean Author's field
meta_data$Aut <- meta_data$Aut %>% map(clean) %>% map(clean2)

rm_na <- function(x){
  list(na.omit(unlist(x)))
}


# Process the fields Aut, Dep, Imp, RD, RI
c_dat1 <- seq_len(nrow(meta_data)) %>%
  map_df(~{
    meta_data[.x, ] %>%
      select(-Package,-DepImp,-RDRI) %>%
      map_df(~ifelse(is.na(.x), 0, length(str_split(.x, ",")[[1]]))) %>%
      mutate(Package = meta_data$Package[.x])
  }) %>%
  select(Package, Aut, Dep, Imp, RD, RI)
# head(c_dat1)

# Process the fields DepImp RDRI
c_dat2 <- seq_len(nrow(meta_data)) %>%
  map_df(~{
    meta_data[.x, ] %>%
      select(-Package, -Aut, -Dep, -Imp, -RD, -RI, -Date) %>%
      map_df(~ifelse(is.na(.x), 0, length(rm_na(.x)[[1]])))
  }) %>%
  select(DepImp, RDRI)

# head(c_dat2)
c_dat <- bind_cols(c_dat1, c_dat2, date = meta_data$Date)

head(c_dat)

saveRDS(c_dat, "c_dat.rds")

ss <- function(x){
  avg <- round(mean(x),digits=2)
  std <- round(sd(x),digits=2)
  med <- median(x)
  res <- list(mean = avg, sd = std, median = med)
}

res <- cbind(names(c_dat[,-c(1,9)]), map_df(c_dat[,-c(1,9)],ss))
names(res) <- c("Features", "mean", "sd", "median")
res

quantile(c_dat$RDRI)

## 004-scatterplots-of-tests-and-metadata.R
library(tidyverse)
library(lubridate)

size_of_tests_per_pkg <- readRDS("size_of_tests_per_pkg.rds")
c_dat <- readRDS("c_dat.rds")

pkg_test_size_desc <-
  left_join(size_of_tests_per_pkg,
            c_dat, by = c("cran_pkg_names" = "Package")) %>%
  mutate(date_fmt = ymd(date)) %>%
  mutate(year = round_date(date_fmt, unit = "year"))

saveRDS(pkg_test_size_desc, "pkg_test_size_desc.rds")

# test ratio and Aut/RD/Date/pkg_size
test_ratio_vs_various_facet_plot <-
  pkg_test_size_desc %>%
  select(test_size_ratio,
         Aut,
         RDRI,
         pkg_size) %>%
  gather(variable,
         value,
         -test_size_ratio) %>%
  ggplot(aes(test_size_ratio,
             value)) +
  geom_point() +
  scale_y_log10() +
  facet_wrap( ~ variable,
              scales = "free") +
  theme_minimal(base_size = 14)

library(cowplot)
plot_grid(size_of_tests_per_pkg_hist_plot,
          test_ratio_vs_various_facet_plot,
          ncol = 1)

## 005-pkg-birth-dates-from-CRAN.R
# get pkg dates from here
library(rvest)
library(tidyverse)
library(lubridate)
library(ggbeeswarm)

# we need to look at the CRAN archive page for every pkg to get the first date

cran_pkg_names <- readRDS("cran_pkg_names.rds")

archive_url <- "https://cran.r-project.org/src/contrib/Archive/"

archive_urls <- map_chr(cran_pkg_names, ~str_glue('{archive_url}{.x}'))

get_archive_tables_safely <-
  safely(function(x) read_html(x) %>%
            html_nodes("table") %>%
            html_table() %>%
           .[[1]])

archive_tbls_lst <-
  map(archive_urls[1:1000], ~get_archive_tables_safely(.x))

archive_tbls_lst_result <-
transpose(archive_tbls_lst)$result


#-------------------------------
the_url <- "https://cran.r-project.org/web/packages/available_packages_by_date.html"


cran_dates <- read_html(the_url)

cran_dates_tbl <-
  cran_dates %>%
  html_nodes("table") %>%
  html_table()

cran_dates_tbl <- cran_dates_tbl[[1]]

saveRDS(cran_dates_tbl, "cran_dates_tbl.rds")
cran_dates_tbl <- readRDS("cran_dates_tbl.rds")
pkg_test_size_desc <- readRDS("pkg_test_size_desc.rds")

pkg_test_size_desc_birth_dates <-
left_join(cran_dates_tbl,
          pkg_test_size_desc,
          by = c("Package" = "cran_pkg_names")) %>%
  mutate(birth_date = ymd(Date)) %>%
  mutate(birth_year = round_date(birth_date, unit = "year"))  %>%
  mutate(birth_year = str_trunc(as.character(birth_year), 4, side = "right", ellipsis = ""))

ggplot(pkg_test_size_desc_birth_dates,
       aes(birth_year,
           test_size_ratio)) +
  geom_boxplot() +
  geom_quasirandom(alpha = 0.2) +
  labs(y = "Ratio of test file size to pkg size",
       x = "Year of pkg origin") +
  scale_y_log10() +
  theme_minimal(base_size = 14)

summary(pkg_test_size_desc_birth_dates)

## 006-clone-pkg-from-cran-github.R

# download all github.com/cran repos
library(tidyverse)
library(git2r)
library(gh)

.token = ""  # get one from https://github.com/settings/tokens

# get all pkgs on GitHub mirror of CRAN
n <-  16125 # 16125 # https://github.com/cran
cran_repos <- gh("/users/:username/repos",
                 username = "cran",
                 .limit = n,
                 .token = .token )

# extract pkg names
cran_pkg_names <- vapply(cran_repos, "[[", "", "name")

# download source code of each pkg
map(cran_pkg_names[14786:n], ~clone(str_glue("https://github.com/cran/{.x}"),
                           local_path = .x))

# error checking after a few false startss
 match("TeXCheckR",  cran_pkg_names )

## 007-covr-of-ctv-pkgs
library(ctv)
library(tidyverse)
library(pak)
library(lubridate)
library(ggridges)
conflicted::conflict_prefer("filter", "dplyr")

# get view metadata
tv <- available.views() # this takes a while

# package list
dat <- tv %>%
  transpose()

# tag with taskview identifier and bind in one df
taskview_pkg_id <- dat %>%
  as_tibble() %>%
  mutate(pkg = map2(name, packagelist, .f = function(x, y){
    y %>%
      mutate(taskview = x,
             no_pkgs_in_tv = nrow(y))
  })) %>%
  pluck("pkg") %>%
  bind_rows() %>%
  rename(taskview_core = core)

# for each pkg in ctv, download the
# src from github.com/cran ...
# first apt-get install a bunch of things...

library(git2r)
library(gh)

safe_clone <- safely(git2r::clone)
safe_local_install <- safely(pak::pkg_install)
safe_covr_check <- safely(covr::package_coverage)
safe_remove <- safely(pak::pkg_remove)

n <- length(taskview_pkg_id$name)

covr_results <- vector("list", length = n )

for(i in 1:n){

  # clone from github
  message(paste0("Now we are cloning ", taskview_pkg_id$name[i]))
  safe_clone(str_glue("https://github.com/cran/{taskview_pkg_id$name[i]}"),
             local_path = taskview_pkg_id$name[i])

  # install from source, need to do this to get deps
  message(paste0("Now we are installing ", taskview_pkg_id$name[i]))
  safe_local_install(taskview_pkg_id$name[i],
                     ask = FALSE)

  message(paste0("Now we are running covr on ", taskview_pkg_id$name[i]))
  covr_results[[i]] <-
    safe_covr_check(taskview_pkg_id$name[i])

  safe_remove(taskview_pkg_id$name[i])
  unlink(taskview_pkg_id$name[i], recursive = TRUE )
  unlink( str_subset (list.files(path = "/usr/local/lib/R/site-library//",
                                 full.names = TRUE),
                      "00LOCK"),
          force = TRUE,
          recursive = TRUE)

  message(paste0("Now we have completed ", i, " of ", n, " packages"))
  gc()
}

names(covr_results) <- taskview_pkg_id$name

covr_results_tr <-
  purrr::transpose(covr_results)$result

saveRDS(covr_results_tr, "covr_results_tr.rds")


## 999-docker-details
sudo apt install docker.io

sudo docker run -e PASSWORD=ben -e ROOT=TRUE --rm -p 8787:8787 -m=6G --memory-swap=-1 rocker/verse

sudo apt-get update

sudo apt-get install jags libgsl-dev gtk2.0 xvfb libcairo2-dev xorg openbox xauth -y --no-install-recommends libx11-6 libxss1 libxt6 libxext6 libsm6 libice6 xdg-utils r-cran-cairodevice r-cran-rgtk2 libatk1.0-dev libpango1.0-dev libgtk2.0-dev libglib2.0-dev libcurl4-openssl-dev libgdal-dev libproj-dev

http://localhost:8787/

	library(tidyverse)
	library(gh)

	.token = "" # get one from https://github.com/settings/tokens

	# get all pkgs on GitHub mirror of CRAN
	n <- 5000 # 16125 # https://github.com/cran
	cran_repos <- gh("/users/:username/repos",
	username = "cran",
	.limit = n,
	.token = .token )

	# extract pkg names
	cran_pkg_names <- vapply(cran_repos, "[[", "", "name")
	cran_pkg_created <- vapply(cran_repos, "[[", "", "created_at")

	# how many do we have?
	length(cran_pkg_names)

	# get tree for each pkg

	gh_safely <- safely(gh)

	pkg_trees <-
	map(cran_pkg_names,
	~gh_safely("GET /repos/:username/:repo/git/trees/master?recursive=1",
	username = "cran",
	repo = .x,
	.token = .token))

	saveRDS(pkg_trees, "pkg_trees.rds")
	saveRDS(cran_repos, "cran_repos.rds")
	saveRDS(cran_pkg_names, "cran_pkg_names.rds")
	rm(pkg_trees)
	rm(cran_repos)


	# how many pkgs have tests?
	# how does test presence/absense/size relate to...
	# pkg age/size/contributors/deps
	# where do people put tests? toplevel tests/ or inst/ or what?
	# what frameworks do they use?
	# what testing fns do they use? expect_? are they just testing for correct class? budget!
	# what is the frequency of testing taxa?

	# related reading

	# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5500893/
	# https://yihui.name/en/2013/09/testing-r-packages/
	# http://www.win-vector.com/blog/2019/03/unit-tests-in-r/
	# RUnit (released June 2004) itself collects test suites from directories and then runs them, recording user assertions in a JUnit-inspired report. The idea is that once you have a bunch of tests you really want to track them some way.
	# testthat (released November 2009) self-describes as integrating into a workflow. It runs tests found in the tests/testthat sub-directory (directory found relative to the package source, not relative to an installed package) and tracks user assertions. The related devtools/usethis package both writes a canonical test controlling file into the tests directory (allowing testthat to be triggered by "R CMD check"), and can also directly run tests.
	# unitizer (released April 2017) bases its tests on comparisons of objects, rather than comparing text or requiring user assertions. It also aids in producing and updating reference objects.
	# tinytest (pre-release)
	# tests can be in test/ or inst/
	# https://www.mango-solutions.com/blog/analyzing-coverage-of-r-unit-tests-in-packages-the-testcoverage-package
	# analysis using revdeps, "These numbers equate to around 8% of R packages on CRAN containing any kind of recognised test framework."
	# http://r-pkgs.had.co.nz/tests.html
	# https://journal.r-project.org/archive/2012/RJ-2012-018/RJ-2012-018.pdf
	library(tidyverse)
	pkg_trees <- readRDS("pkg_trees.rds")
	cran_pkg_names <- readRDS("cran_pkg_names.rds")

	# drop the errors
	good_ones <- transpose(pkg_trees)$result
	# how many?
	length(good_ones)


	# get list of dataframes of files for each pkg
	x <- map(good_ones, 3, 'tree') %>%
	map(~map_df(.x, ~.x)) %>%
	set_names(cran_pkg_names)

	# keep only the files with 'test' in them
	x_tests_dir <-
	map(x, ~.x %>% filter(str_detect(path, "tests/"))) %>%
	discard( ~ nrow(.x) == 0)

	# how many have tests?
	n_pkgs_with_tests <- length(x_tests_dir)

	# what proportion of our sample?
	n <- length(good_ones)
	prop_with_tests <- n_pkgs_with_tests / n
	perc_with_tests <- prop_with_tests * 100

	# rough size of tests per pkg
	test_file_sizes <-
	x_tests_dir %>%
	map(~.x %>% filter(str_detect( path , "\\.R$\|\\.r$")))

	size_of_tests_per_pkg <-
	map_df(test_file_sizes, ~sum(.x$size, na.rm = TRUE)) %>%
	gather(cran_pkg_names, test_size) %>%
	left_join(
	tibble(cran_pkg_names = cran_pkg_names,
	pkg_size = map_int(x, ~sum(.x$size, na.rm = TRUE)))) %>%
	mutate(test_size_ratio = test_size / pkg_size)

	saveRDS(size_of_tests_per_pkg, "size_of_tests_per_pkg.rds")

	size_of_tests_per_pkg_hist_plot <-
	ggplot(size_of_tests_per_pkg,
	aes(test_size_ratio)) +
	geom_histogram() +
	labs(x = "Ratio of test files size to total pkg size",
	y = "Count") +
	ggtitle("Tests in R pkgs on CRAN: How much of the pkg are tests?",
	subtitle = str_glue('{n_pkgs_with_tests} pkgs that have tests, out of a sample of {n} pkg ({perc_with_tests}%)')) +
	theme_minimal(base_size = 14)

	# pkg metadata from CRAN ------------------------------------
	# https://rviews.rstudio.com/2018/03/08/cran-package-metadata/
	# https://www.rstudio.com/resources/videos/what-makes-a-great-r-package/

	pkg_DESC <- tools:::CRAN_package_db()
	meta_data <- pkg_DESC[, c(1,4,5,17,37,60,61)]
	names(meta_data) <- c("Package","Dep","Imp","Aut","Date","RD","RI")
	library(stringr)
	# A helper function to unlist and string split.
	fcn <- function(x,y){
	x <- unlist(x) %>% strsplit(",")
	y <- unlist(y) %>% strsplit(",")
	z <- unlist(na.omit(union(x, y)))
	}

	meta_data <- mutate(meta_data,
	DepImp = mapply(fcn,Dep,Imp),
	RDRI = mapply(fcn,RD,RI))

	# CLEAN THE AUTHOR'S FIELD
	# Function to remove all text between two brackets
	# http://bit.ly/2mE7TNJ
	clean <- function(x){
	gsub("\\[[^]]*]", "",x)
	}

	# Function to remove line breaks
	# http://bit.ly/2B0n4VS

	clean2 <- function(x){
	gsub("[\r\n]", "", x)
	}

	# Clean Author's field
	meta_data$Aut <- meta_data$Aut %>% map(clean) %>% map(clean2)

	rm_na <- function(x){
	list(na.omit(unlist(x)))
	}


	# Process the fields Aut, Dep, Imp, RD, RI
	c_dat1 <- seq_len(nrow(meta_data)) %>%
	map_df(~{
	meta_data[.x, ] %>%
	select(-Package,-DepImp,-RDRI) %>%
	map_df(~ifelse(is.na(.x), 0, length(str_split(.x, ",")[[1]]))) %>%
	mutate(Package = meta_data$Package[.x])
	}) %>%
	select(Package, Aut, Dep, Imp, RD, RI)
	# head(c_dat1)

	# Process the fields DepImp RDRI
	c_dat2 <- seq_len(nrow(meta_data)) %>%
	map_df(~{
	meta_data[.x, ] %>%
	select(-Package, -Aut, -Dep, -Imp, -RD, -RI, -Date) %>%
	map_df(~ifelse(is.na(.x), 0, length(rm_na(.x)[[1]])))
	}) %>%
	select(DepImp, RDRI)

	# head(c_dat2)
	c_dat <- bind_cols(c_dat1, c_dat2, date = meta_data$Date)

	head(c_dat)

	saveRDS(c_dat, "c_dat.rds")

	ss <- function(x){
	avg <- round(mean(x),digits=2)
	std <- round(sd(x),digits=2)
	med <- median(x)
	res <- list(mean = avg, sd = std, median = med)
	}

	res <- cbind(names(c_dat[,-c(1,9)]), map_df(c_dat[,-c(1,9)],ss))
	names(res) <- c("Features", "mean", "sd", "median")
	res

	quantile(c_dat$RDRI)
	library(tidyverse)
	library(lubridate)

	size_of_tests_per_pkg <- readRDS("size_of_tests_per_pkg.rds")
	c_dat <- readRDS("c_dat.rds")

	pkg_test_size_desc <-
	left_join(size_of_tests_per_pkg,
	c_dat, by = c("cran_pkg_names" = "Package")) %>%
	mutate(date_fmt = ymd(date)) %>%
	mutate(year = round_date(date_fmt, unit = "year"))

	saveRDS(pkg_test_size_desc, "pkg_test_size_desc.rds")

	# test ratio and Aut/RD/Date/pkg_size
	test_ratio_vs_various_facet_plot <-
	pkg_test_size_desc %>%
	select(test_size_ratio,
	Aut,
	RDRI,
	pkg_size) %>%
	gather(variable,
	value,
	-test_size_ratio) %>%
	ggplot(aes(test_size_ratio,
	value)) +
	geom_point() +
	scale_y_log10() +
	facet_wrap( ~ variable,
	scales = "free") +
	theme_minimal(base_size = 14)

	library(cowplot)
	plot_grid(size_of_tests_per_pkg_hist_plot,
	test_ratio_vs_various_facet_plot,
	ncol = 1)
	# get pkg dates from here
	library(rvest)
	library(tidyverse)
	library(lubridate)
	library(ggbeeswarm)

	# we need to look at the CRAN archive page for every pkg to get the first date

	cran_pkg_names <- readRDS("cran_pkg_names.rds")

	archive_url <- "https://cran.r-project.org/src/contrib/Archive/"

	archive_urls <- map_chr(cran_pkg_names, ~str_glue('{archive_url}{.x}'))

	get_archive_tables_safely <-
	safely(function(x) read_html(x) %>%
	html_nodes("table") %>%
	html_table() %>%
	.[[1]])

	archive_tbls_lst <-
	map(archive_urls[1:1000], ~get_archive_tables_safely(.x))

	archive_tbls_lst_result <-
	transpose(archive_tbls_lst)$result







	#-------------------------------
	the_url <- "https://cran.r-project.org/web/packages/available_packages_by_date.html"


	cran_dates <- read_html(the_url)

	cran_dates_tbl <-
	cran_dates %>%
	html_nodes("table") %>%
	html_table()

	cran_dates_tbl <- cran_dates_tbl[[1]]

	saveRDS(cran_dates_tbl, "cran_dates_tbl.rds")
	cran_dates_tbl <- readRDS("cran_dates_tbl.rds")
	pkg_test_size_desc <- readRDS("pkg_test_size_desc.rds")

	pkg_test_size_desc_birth_dates <-
	left_join(cran_dates_tbl,
	pkg_test_size_desc,
	by = c("Package" = "cran_pkg_names")) %>%
	mutate(birth_date = ymd(Date)) %>%
	mutate(birth_year = round_date(birth_date, unit = "year")) %>%
	mutate(birth_year = str_trunc(as.character(birth_year), 4, side = "right", ellipsis = ""))

	ggplot(pkg_test_size_desc_birth_dates,
	aes(birth_year,
	test_size_ratio)) +
	geom_boxplot() +
	geom_quasirandom(alpha = 0.2) +
	labs(y = "Ratio of test file size to pkg size",
	x = "Year of pkg origin") +
	scale_y_log10() +
	theme_minimal(base_size = 14)

	summary(pkg_test_size_desc_birth_dates)

	# download all github.com/cran repos
	library(tidyverse)
	library(git2r)
	library(gh)

	.token = "" # get one from https://github.com/settings/tokens

	# get all pkgs on GitHub mirror of CRAN
	n <- 16125 # 16125 # https://github.com/cran
	cran_repos <- gh("/users/:username/repos",
	username = "cran",
	.limit = n,
	.token = .token )

	# extract pkg names
	cran_pkg_names <- vapply(cran_repos, "[[", "", "name")

	# download source code of each pkg
	map(cran_pkg_names[14786:n], ~clone(str_glue("https://github.com/cran/{.x}"),
	local_path = .x))

	# error checking after a few false startss
	match("TeXCheckR", cran_pkg_names )
	library(ctv)
	library(tidyverse)
	library(pak)
	library(lubridate)
	library(ggridges)
	conflicted::conflict_prefer("filter", "dplyr")

	# get view metadata
	tv <- available.views() # this takes a while

	# package list
	dat <- tv %>%
	transpose()

	# tag with taskview identifier and bind in one df
	taskview_pkg_id <- dat %>%
	as_tibble() %>%
	mutate(pkg = map2(name, packagelist, .f = function(x, y){
	y %>%
	mutate(taskview = x,
	no_pkgs_in_tv = nrow(y))
	})) %>%
	pluck("pkg") %>%
	bind_rows() %>%
	rename(taskview_core = core)

	# for each pkg in ctv, download the
	# src from github.com/cran ...
	# first apt-get install a bunch of things...

	library(git2r)
	library(gh)

	safe_clone <- safely(git2r::clone)
	safe_local_install <- safely(pak::pkg_install)
	safe_covr_check <- safely(covr::package_coverage)
	safe_remove <- safely(pak::pkg_remove)

	n <- length(taskview_pkg_id$name)

	covr_results <- vector("list", length = n )

	for(i in 1:n){

	# clone from github
	message(paste0("Now we are cloning ", taskview_pkg_id$name[i]))
	safe_clone(str_glue("https://github.com/cran/{taskview_pkg_id$name[i]}"),
	local_path = taskview_pkg_id$name[i])

	# install from source, need to do this to get deps
	message(paste0("Now we are installing ", taskview_pkg_id$name[i]))
	safe_local_install(taskview_pkg_id$name[i],
	ask = FALSE)

	message(paste0("Now we are running covr on ", taskview_pkg_id$name[i]))
	covr_results[[i]] <-
	safe_covr_check(taskview_pkg_id$name[i])

	safe_remove(taskview_pkg_id$name[i])
	unlink(taskview_pkg_id$name[i], recursive = TRUE )
	unlink( str_subset (list.files(path = "/usr/local/lib/R/site-library//",
	full.names = TRUE),
	"00LOCK"),
	force = TRUE,
	recursive = TRUE)

	message(paste0("Now we have completed ", i, " of ", n, " packages"))
	gc()
	}

	names(covr_results) <- taskview_pkg_id$name

	covr_results_tr <-
	purrr::transpose(covr_results)$result

	saveRDS(covr_results_tr, "covr_results_tr.rds")
	sudo apt install docker.io

	sudo docker run -e PASSWORD=ben -e ROOT=TRUE --rm -p 8787:8787 -m=6G --memory-swap=-1 rocker/verse

	sudo apt-get update

	sudo apt-get install jags libgsl-dev gtk2.0 xvfb libcairo2-dev xorg openbox xauth -y --no-install-recommends libx11-6 libxss1 libxt6 libxext6 libsm6 libice6 xdg-utils r-cran-cairodevice r-cran-rgtk2 libatk1.0-dev libpango1.0-dev libgtk2.0-dev libglib2.0-dev libcurl4-openssl-dev libgdal-dev libproj-dev

	http://localhost:8787/