Last active
August 29, 2015 14:19
-
-
Save bearloga/cf21f39cea2553cb1594 to your computer and use it in GitHub Desktop.
Obtaining licensing metadata for task-specific R packages on CRAN (…for the purpose of, say, avoiding GPL-licensed packages)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# devtools::install_github("RcppCore/Rcpp")
# devtools::install_github("hadley/xml2")
# devtools::install_github("hadley/rvest")
# devtools::install_github("metacran/crandb")
# devtools::install_github('ironholds/practice')
# install.packages('dplyr')
library(rvest); library(crandb)
#' Collect licence metadata for every package listed on a task-view page
#'
#' Downloads the page at `task_url`, takes the text of every link found under
#' `body ul li a` as a package name, and looks each one up with
#' `practice:::get_package_metadata()`, keeping the `Package` and `License`
#' fields of the last entry in `versions`. Progress (URL and elapsed seconds)
#' is printed with `cat()` once the page has been processed.
#'
#' @param task_url URL of the task-view page to scrape.
#' @return A data frame with the columns `Package` and `License`, one row per
#'   package. The data frame has zero rows when the page lists no packages,
#'   and `License` is `NA` for a package whose metadata has no licence field.
get_licensing_for_task <- function(task_url) {
  # Look up one package and always return exactly two strings, so that the
  # vapply() below stays type-stable even when a field is missing.
  fetch_license <- function(pkg) {
    metadata <- practice:::get_package_metadata(pkg)
    # Presumably `versions` is ordered oldest to newest, which would make the
    # last entry the current release -- TODO confirm against `practice`.
    latest <- tail(metadata[["versions"]], 1)[[1]]
    field <- function(name, default) {
      value <- latest[[name]]
      if (length(value) == 0L) default else as.character(value)[[1L]]
    }
    c(
      Package = field("Package", pkg),
      License = field("License", NA_character_)
    )
  }

  elapsed <- system.time({
    task <- read_html(task_url)
    # Fetch packages listed in task
    link_text <- task %>%
      html_nodes("body ul li a") %>%
      html_text()
    # Link texts containing a line break are dropped, the rest de-duplicated
    task_packages <- unique(
      grep(pattern = "\\n", x = link_text, invert = TRUE, value = TRUE)
    )
    # Fetch license metadata for each package in task.
    # vapply() yields a 2 x n character matrix, including the n == 0 case.
    license_matrix <- vapply(
      task_packages,
      fetch_license,
      FUN.VALUE = character(2),
      USE.NAMES = FALSE
    )
    licensing_metadata <- data.frame(
      Package = license_matrix[1, ],
      License = license_matrix[2, ]
    )
  })["elapsed"]
  cat("Finished processing", task_url, "(took", elapsed, "seconds)", "\n")
  licensing_metadata
}
# Fetch tasks: scrape the task-view index page into a character matrix with
# one row per table row and the columns "Task" and "Webpage".
tasks <- read_html("http://cran.r-project.org/web/views/")
# Plain assignment instead of `%<>%`: only rvest and crandb are attached, and
# the compound-assignment pipe needs magrittr itself to be attached.
# vapply() requires exactly two strings per row (text of the second cell and
# the link target), so an unexpected row fails here rather than inside t().
tasks <- vapply(
  html_nodes(tasks, "tr"),
  function(row) {
    c(
      html_nodes(row, "td") %>% { .[2] } %>% html_text(),
      html_nodes(row, "a") %>% html_attr("href")
    )
  },
  FUN.VALUE = character(2)
) %>%
  t()
colnames(tasks) <- c("Task", "Webpage")
#> head(tasks,1)
#      Task                 Webpage
# [1,] "Bayesian Inference" "Bayesian.html"

# Obtain licensing information for the packages in each task: every relative
# page name is turned into a full URL and handed to get_licensing_for_task().
# NOTE(review): each call returns a two-column data frame, so sapply() will
# likely simplify the results into a 2 x n list-matrix rather than a list of
# data frames; pass `simplify = FALSE` if a plain list is wanted -- confirm
# against how `licenses` is consumed.
licenses <- sapply(
  tasks[, "Webpage"],
  function(page) {
    get_licensing_for_task(
      paste0("http://cran.r-project.org/web/views/", page)
    )
  }
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Check out get_package_names() in the practice package for added simplification. I foresee this being a really fun exploratory tool \o/