-
-
Save karldw/1038f9b8605b4ee97842f4b65ed6c09f to your computer and use it in GitHub Desktop.
Parse thirdparty version file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
```r | |
# Expand bash-style version placeholders in a single string.
#
# Recognised placeholders, where NAME looks like `ARROW_<...>_VERSION`
# (e.g. ARROW_RAPIDJSON_BUILD_VERSION):
#   ${NAME}       -> the value of NAME
#   ${NAME:1}     -> the value with its first character dropped
#   ${NAME//./_}  -> the value with every "." replaced by "_"
#
# Args:
#   one_string: a single (length-1, non-NA) string containing between one
#     and nine placeholders.
#   possible_values: a vector without NAs whose names are unique and
#     non-empty; every placeholder NAME must be one of those names.
#
# Returns: `one_string` with each placeholder replaced by its value.
# Errors when the arguments are malformed, when no placeholder is present,
# when ten or more are present, or when a placeholder refers to a name that
# is missing from `possible_values`.
..substitute_like_bash <- function(one_string, possible_values) {
  value_names <- names(possible_values)
  stopifnot(
    # Base-R spelling of "every element is uniquely named"
    !is.null(value_names),
    !anyNA(value_names),
    all(nzchar(value_names)),
    anyDuplicated(value_names) == 0L,
    length(one_string) == 1,
    !is.na(one_string),
    !anyNA(possible_values)
  )
  # Find the name of the version we want, something like
  # ARROW_RAPIDJSON_BUILD_VERSION
  # The `(//./_|:1)` is a special case to handle some bash fanciness.
  version_regex <- "\\$\\{(ARROW_[A-Z0-9_]+?_VERSION)(//./_|:1)?\\}"
  match_mat <- regmatches(
    one_string,
    gregexec(version_regex, one_string, perl = TRUE)
  )[[1]] # Subset [[1]] because one_string has length 1

  # Without a match there is no matrix to index into, so fail here with a
  # readable message rather than a subscript error further down.
  if (!is.matrix(match_mat) || ncol(match_mat) == 0L) {
    stop(
      "No ${ARROW_..._VERSION} placeholder found in: ", one_string,
      call. = FALSE
    )
  }

  # One column per placeholder. Rows: 1 = the whole placeholder, 2 = the
  # variable name, 3 = the bash modifier ("" when there is none).
  string_to_sub <- match_mat[1, ]
  version_varnames <- match_mat[2, ]
  bash_special_cases <- match_mat[3, ]

  num_to_sub <- length(string_to_sub)
  if (num_to_sub >= 10) {
    # Something has gone wrong if we're doing 10+
    stop(
      "Found ", num_to_sub, " placeholders in one string; ",
      "expected fewer than 10.",
      call. = FALSE
    )
  }
  unknown_names <- setdiff(version_varnames, value_names)
  if (length(unknown_names) > 0) {
    stop(
      "Unknown version variable(s): ",
      paste(unknown_names, collapse = ", "),
      call. = FALSE
    )
  }

  version_values <- unname(possible_values[version_varnames])
  # The regex only admits "", ":1" and "//./_" as modifiers, so these two
  # masks cover every case that needs a transformation.
  drop_first_char <- bash_special_cases == ":1"
  dots_to_underscores <- bash_special_cases == "//./_"
  version_values[drop_first_char] <- substring(
    version_values[drop_first_char], 2
  )
  version_values[dots_to_underscores] <- gsub(
    ".", "_", version_values[dots_to_underscores],
    fixed = TRUE
  )

  out <- one_string
  for (idx in seq_len(num_to_sub)) {
    # `sub`, not `gsub`: a placeholder that occurs twice is also listed
    # twice in string_to_sub, so each pass consumes exactly one occurrence.
    out <- sub(string_to_sub[idx], version_values[idx], out, fixed = TRUE)
  }
  out
}
# Apply `..substitute_like_bash` to every filename and every URL.
#
# Args:
#   deps_unsubstituted: a list with elements `filenames` and `urls`, each
#     holding strings that still contain version placeholders.
#   possible_values: passed through unchanged to `..substitute_like_bash`.
#
# Returns: a list with unnamed character vectors `filenames` and `urls`.
..substitute_all <- function(deps_unsubstituted, possible_values) {
  # Expand one vector of strings, one element at a time; vapply enforces
  # that each expansion yields exactly one string.
  expand_each <- function(strings) {
    expanded <- vapply(
      strings,
      ..substitute_like_bash,
      FUN.VALUE = character(1),
      possible_values = possible_values
    )
    unname(expanded)
  }

  list(
    filenames = expand_each(deps_unsubstituted$filenames),
    urls = expand_each(deps_unsubstituted$urls)
  )
}
# Parse `NAME=value` lines into a named character vector of versions.
#
# Every line (after trimming whitespace) must look like
#   ARROW_<...>_VERSION=<value>   or   ARROW_<...>_SHA256_CHECKSUM=<value>
# otherwise the function stops. Only the `_VERSION` lines are returned; the
# checksum lines are validated and then dropped.
#
# Args:
#   version_lines: character vector, one assignment per element.
#
# Returns: character vector of values, named by the full variable name
#   (e.g. "ARROW_RAPIDJSON_BUILD_VERSION").
..parse_version_lines <- function(version_lines) {
  trimmed <- trimws(version_lines)
  # Groups: 1 = name prefix (e.g. "ARROW_RAPIDJSON_BUILD_"),
  #         2 = "VERSION" or "SHA256_CHECKSUM", 3 = the value.
  line_pattern <- "^(ARROW_[A-Z0-9_]+_)(VERSION|SHA256_CHECKSUM)=([^=]+)$"
  if (!all(grepl(line_pattern, trimmed, perl = TRUE))) {
    stop("Failed to parse version lines")
  }

  # The pattern is anchored at both ends, so replacing the match with a
  # back-reference extracts exactly that group.
  pick <- function(groups, from) sub(line_pattern, groups, from, perl = TRUE)

  version_only <- trimmed[pick("\\2", trimmed) == "VERSION"]
  parsed <- pick("\\3", version_only)
  names(parsed) <- pick("\\1\\2", version_only)
  parsed
}
# Split the entries of the bash DEPENDENCIES array into filenames and URLs.
#
# Each entry is one quoted line with three space-separated fields, e.g.
# `"ARROW_RAPIDJSON_URL rapidjson-${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz https://github.com/miloyip/rapidjson/archive/${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz"`
#   1. the name of the URL variable (matters for cmake, unused here),
#   2. the filename to save under (no directory part),
#   3. the URL, still containing version placeholders defined earlier in
#      the file.
#
# Args:
#   array_lines: non-empty character vector without NAs, one entry each.
#
# Returns: list(filenames = <chr>, urls = <chr>), both unsubstituted.
# Stops if any entry does not have the expected shape.
..parse_dependency_array <- function(array_lines) {
  stopifnot(
    length(array_lines) >= 1,
    is.character(array_lines),
    !anyNA(array_lines)
  )
  entries <- trimws(array_lines)

  # Pattern in words: an opening quote, then `ARROW_` followed by capital
  # ASCII letters, digits or underscores and ending in `_URL`. After a
  # space comes the filename: anything except a space, colon, forward slash
  # or quote (bash itself needs the "no space" rule; colon and slash are
  # basic guards that this really is a bare filename). After another space
  # comes the URL, which starts with https:// and runs up to the closing
  # quote without spaces. The URL is matched before version substitution,
  # so normal URL parsing rules don't apply.
  entry_pattern <- '^"(ARROW_[A-Z0-9_]+_URL) ([^ :/"]+) (https://[^ "]+)"$'
  well_formed <- grepl(entry_pattern, entries, perl = TRUE)
  if (!all(well_formed)) {
    stop("Cannot parse thirdparty dependency array in expected format.")
  }

  # Replace each whole entry by one of its capture groups.
  field <- function(backref) gsub(entry_pattern, backref, entries, perl = TRUE)
  list(
    filenames = field("\\2"),
    urls = field("\\3")
  )
}
# Read a thirdparty versions file and return the substituted filenames and
# URLs of all dependencies.
#
# Expected layout, after comments (`#` to end of line) and blank lines are
# removed: one or more `ARROW_..._VERSION=` / `ARROW_..._SHA256_CHECKSUM=`
# lines, then a line `DEPENDENCIES=(`, then the array entries, then a final
# line `)`.
#
# Args:
#   versions_file: anything `readLines()` accepts (a path, URL or
#     connection).
#
# Returns: the result of `..substitute_all()`, i.e. a list with `filenames`
#   and `urls`.
# Stops with a single descriptive message when the file does not have the
# expected layout. All layout checks run before any index derived from the
# file is used, so a malformed file cannot trigger an unrelated subscript
# or `seq.int()` error first.
..parse_lines <- function(versions_file) {
  fail_to_parse <- function() {
    stop(
      "Failed to parse 3rd party dependency file. It's possible the function ",
      "is not reading the correct file or the file formatting was not what ",
      "was expected.",
      call. = FALSE
    )
  }

  orig_lines <- readLines(versions_file)
  if (anyNA(orig_lines) || length(orig_lines) > 1000) {
    fail_to_parse()
  }

  # Strip comments, then surrounding whitespace, so that indented comment
  # lines and trailing spaces do not survive as pseudo-content.
  lines <- trimws(gsub("#.*", "", orig_lines, perl = TRUE))
  lines <- lines[lines != ""]
  num_lines <- length(lines)
  dep_array_start_idx <- grep("^DEPENDENCIES=\\($", lines, perl = TRUE)

  # `&&` short-circuits, so the index is only compared once it is known to
  # be a single value, and `lines[num_lines]` is only read when non-empty.
  layout_ok <- num_lines > 0 &&
    length(dep_array_start_idx) == 1 &&
    dep_array_start_idx > 1 &&
    # Same bound as before: the array opener must sit more than three lines
    # above the closing ")".
    dep_array_start_idx < num_lines - 3 &&
    lines[num_lines] == ")"
  if (!layout_ok) {
    fail_to_parse()
  }

  version_lines <- lines[seq_len(dep_array_start_idx - 1)]
  dep_array_lines <- lines[seq.int(dep_array_start_idx + 1, num_lines - 1)]
  version_info <- ..parse_version_lines(version_lines)
  if (length(dep_array_lines) == 0 || anyNA(version_info)) {
    fail_to_parse()
  }

  deps_unsubstituted <- ..parse_dependency_array(dep_array_lines)
  ..substitute_all(deps_unsubstituted, possible_values = version_info)
}
# Download every dependency into `download_dir`.
#
# Args:
#   dep_info: list with `urls` and `filenames` of equal, non-zero length;
#     the i-th URL is saved under the i-th filename.
#   download_dir: an existing directory (normalizePath is called with
#     mustWork = TRUE, so a missing directory is an error).
#
# Returns: whatever `download.file()` returns.
..download_dependencies <- function(dep_info, download_dir) {
  stopifnot(
    length(dep_info$urls) == length(dep_info$filenames),
    length(dep_info$urls) > 0
  )
  # Forward slashes throughout, so the destination paths are built the same
  # way on every platform.
  target_dir <- normalizePath(download_dir, winslash = "/", mustWork = TRUE)
  destinations <- file.path(target_dir, dep_info$filenames, fsep = "/")

  # libcurl is a different download engine than the wget used by
  # download_dependencies.sh, but it is well supported in R and is needed to
  # pass several URLs in one call. It is available in all CRAN builds, though
  # not guaranteed elsewhere.
  # `download.file` is called unqualified on purpose so that a masking
  # definition (e.g. a memoised wrapper) is picked up.
  download.file(
    dep_info$urls,
    destinations,
    method = "libcurl",
    quiet = TRUE
  )
}
library(testthat) | |
test_that("bash-equivalent substitution works", {
  lookup <- c(ARROW_ZZZ_VERSION = "c.2", ARROW_AAA_VERSION = "v7")
  # One entry per supported placeholder form: plain `${NAME}`, `${NAME:1}`
  # (drop the first character) and `${NAME//./_}` (dots to underscores).
  cases <- list(
    list(
      template = "x ${ARROW_ZZZ_VERSION} _",
      expanded = "x c.2 _"
    ),
    list(
      template = "https://example.com/${ARROW_AAA_VERSION:1}",
      expanded = "https://example.com/7"
    ),
    list(
      template = "x ${ARROW_ZZZ_VERSION//./_} .",
      expanded = "x c_2 ."
    )
  )
  for (case in cases) {
    expect_equal(
      ..substitute_like_bash(case$template, lookup),
      case$expanded
    )
  }
})
# Memoize for demo/testing. The wrapper is assigned to `download.file` in
# this environment, which is the name `..download_dependencies()` calls.
if (!memoise::is.memoized(download.file)) {
  download.file <- memoise::memoize(utils::download.file)
}
# Could use your local version instead.
versions_file <- "https://raw.githubusercontent.com/apache/arrow/master/cpp/thirdparty/versions.txt"
dep_list <- ..parse_lines(versions_file)
download_dir <- file.path(tempdir(), "thirdparty")
# tempdir() stays the same for the whole R session, so on a second run the
# directory already exists; only create it when it is missing to avoid the
# "already exists" warning from dir.create().
if (!dir.exists(download_dir)) {
  dir.create(download_dir)
}
..download_dependencies(dep_list, download_dir)
list.files(download_dir)
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment