@karldw
Created February 23, 2022 23:49
Parse thirdparty version file
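
The R snippet below parses Apache Arrow's `cpp/thirdparty/versions.txt`, substitutes each `${ARROW_..._VERSION}` placeholder the way bash parameter expansion would, and downloads the resulting third-party tarballs to a temporary directory.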
```r
..substitute_like_bash <- function(one_string, possible_values) {
  stopifnot(
    rlang::is_dictionaryish(possible_values),
    length(one_string) == 1,
    !anyNA(possible_values)
  )
  # Find the name of the version we want, something like
  # ARROW_RAPIDJSON_BUILD_VERSION
  # The `(//./_|:1)` group is a special case to handle some bash fanciness.
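  # (In bash, ${VAR:1} drops the first character and ${VAR//./_} replaces
  # every "." with "_"; the nested ifelse() below mimics both expansions.)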
  version_regex <- "\\$\\{(ARROW_[A-Z0-9_]+?_VERSION)(//./_|:1)?\\}"
  match_mat <- regmatches(
    one_string,
    gregexec(version_regex, one_string, perl = TRUE)
  )[[1]] # Subset [[1]] because one_string has length 1
  # But if there are multiple matches, string_to_sub could still have length > 1
  string_to_sub <- match_mat[1, , drop = TRUE]
  version_varnames <- match_mat[2, , drop = TRUE]
  version_values <- possible_values[version_varnames]
  bash_special_cases <- match_mat[3, , drop = TRUE]
  version_values <- ifelse(
    bash_special_cases == "", version_values,
    ifelse(
      bash_special_cases == ":1", substring(version_values, 2),
      ifelse(
        bash_special_cases == "//./_", gsub(".", "_", version_values, fixed = TRUE),
        NA_character_ # otherwise
      )
    )
  )
  num_to_sub <- length(string_to_sub)
  stopifnot(
    all(version_varnames %in% names(possible_values)),
    !anyNA(version_values),
    num_to_sub >= 1,
    num_to_sub < 10 # Something has gone wrong if we're doing 10+
  )
  out <- one_string
  for (idx in seq_len(num_to_sub)) {
    # sub(), not gsub(), in case there are duplicate placeholders
    out <- sub(string_to_sub[idx], version_values[idx], out, fixed = TRUE)
  }
  out
}
..substitute_all <- function(deps_unsubstituted, possible_values) {
  file_substituted <- vapply(
    deps_unsubstituted$filenames,
    ..substitute_like_bash,
    FUN.VALUE = character(1),
    possible_values = possible_values
  )
  url_substituted <- vapply(
    deps_unsubstituted$urls,
    ..substitute_like_bash,
    FUN.VALUE = character(1),
    possible_values = possible_values
  )
  list(
    filenames = unname(file_substituted),
    urls = unname(url_substituted)
  )
}
..parse_version_lines <- function(version_lines) {
  version_lines <- trimws(version_lines)
  version_regex <- "^(ARROW_[A-Z0-9_]+_)(VERSION|SHA256_CHECKSUM)=([^=]+)$"
  if (!all(grepl(version_regex, version_lines, perl = TRUE))) {
    stop("Failed to parse version lines")
  }
  match_list <- regmatches(
    version_lines,
    regexec(version_regex, version_lines, perl = TRUE)
  )
  # Find the lines where the second regex capture group is "VERSION" (as
  # opposed to "SHA256_CHECKSUM")
  version_idx <- vapply(
    match_list,
    function(m) m[[3]] == "VERSION",
    FUN.VALUE = logical(1)
  )
  version_matches <- match_list[version_idx]
  # The indexing here just pastes the first and second capture groups back
  # together, e.g. "ARROW_RAPIDJSON_BUILD_" and "VERSION"
  version_varnames <- vapply(
    version_matches,
    function(m) paste0(m[[2]], m[[3]]),
    FUN.VALUE = character(1)
  )
  version_values <- vapply(
    version_matches,
    function(m) m[[4]],
    FUN.VALUE = character(1)
  )
  names(version_values) <- version_varnames
  return(version_values)
}
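# A small worked example (hypothetical values, following the versions.txt format):
#   ..parse_version_lines(c(
#     "ARROW_RAPIDJSON_BUILD_VERSION=1.1.0",
#     "ARROW_RAPIDJSON_BUILD_SHA256_CHECKSUM=abc123"
#   ))
#   #> c(ARROW_RAPIDJSON_BUILD_VERSION = "1.1.0")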
..parse_dependency_array <- function(array_lines) {
  stopifnot(
    length(array_lines) >= 1,
    is.character(array_lines),
    !anyNA(array_lines)
  )
  array_lines <- trimws(array_lines)
  # Parse the array_lines with a regex. Each line of the array is a different
  # component, e.g.
  # `"ARROW_RAPIDJSON_URL rapidjson-${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz https://github.com/miloyip/rapidjson/archive/${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz"`
  # The first element is the variable name of the URL. That matters for cmake,
  # but not here. The second is the filename that will be saved (no directory).
  # The third is the URL, including some version string that's defined earlier
  # in the file.
  # Regex in words:
  # Start with `"ARROW_`, then any capital ASCII letter, number, or underscore.
  # After a space, match anything except a space, colon, or forward slash.
  # (Disallowing spaces is essential, and it's also what makes the array
  # parseable in bash. The colon and slash are just basic guards that this is
  # a filename.) Next, a space. Then a URL, starting with https:// and
  # including anything except a space. (This is the URL before substituting in
  # the version string, so normal URL parsing rules don't apply.)
  dep_array_regex <- '^"(ARROW_[A-Z0-9_]+_URL) ([^ :/"]+) (https://[^ "]+)"$'
  if (!all(grepl(dep_array_regex, array_lines, perl = TRUE))) {
    stop("Cannot parse thirdparty dependency array in expected format.")
  }
  list(
    filenames = gsub(dep_array_regex, "\\2", array_lines, perl = TRUE),
    urls = gsub(dep_array_regex, "\\3", array_lines, perl = TRUE)
  )
}
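# For the example line quoted in the comments above, this returns
#   list(
#     filenames = "rapidjson-${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz",
#     urls = "https://github.com/miloyip/rapidjson/archive/${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz"
#   )
# The placeholders are still unsubstituted; ..substitute_all() fills them in later.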
..parse_lines <- function(versions_file) {
  orig_lines <- readLines(versions_file)
  lines <- gsub("#.*", "", orig_lines, perl = TRUE)
  lines <- lines[lines != ""]
  dep_array_start_idx <- grep("^DEPENDENCIES=\\($", lines, perl = TRUE)
  dep_array_lines <- lines[
    seq.int(from = dep_array_start_idx + 1, to = length(lines) - 1, by = 1)
  ]
  version_lines <- lines[seq.int(1, dep_array_start_idx - 1, by = 1)]
  version_info <- ..parse_version_lines(version_lines)
  failed_to_parse <- anyNA(orig_lines) ||
    length(orig_lines) > 1000 ||
    length(lines) == 0 ||
    length(dep_array_start_idx) != 1 ||
    dep_array_start_idx <= 1 ||
    dep_array_start_idx >= length(lines) - 3 ||
    lines[length(lines)] != ")" ||
    length(dep_array_lines) == 0 ||
    anyNA(version_info)
  if (failed_to_parse) {
    stop(
      "Failed to parse 3rd party dependency file. It's possible the function ",
      "is not reading the correct file or the file formatting was not what ",
      "was expected.",
      call. = FALSE
    )
  }
  deps_unsubstituted <- ..parse_dependency_array(dep_array_lines)
  ..substitute_all(deps_unsubstituted, possible_values = version_info)
}
..download_dependencies <- function(dep_info, download_dir) {
  stopifnot(
    length(dep_info$urls) == length(dep_info$filenames),
    length(dep_info$urls) > 0
  )
  download_dir <- normalizePath(download_dir, winslash = "/", mustWork = TRUE)
  full_filenames <- file.path(download_dir, dep_info$filenames, fsep = "/")
  # Using libcurl here is well supported in R, but it is a different download
  # engine than the wget used in download_dependencies.sh.
  # libcurl is required for supplying multiple URLs, and is available in all
  # CRAN builds, but isn't guaranteed.
  download.file(dep_info$urls, full_filenames, method = "libcurl", quiet = TRUE)
}
library(testthat)
test_that("bash-equivalent substitution works", {
  vals <- c(ARROW_ZZZ_VERSION = "c.2", ARROW_AAA_VERSION = "v7")
  expect_equal(
    ..substitute_like_bash("x ${ARROW_ZZZ_VERSION} _", vals), "x c.2 _"
  )
  expect_equal(
    ..substitute_like_bash("https://example.com/${ARROW_AAA_VERSION:1}", vals),
    "https://example.com/7"
  )
  expect_equal(
    ..substitute_like_bash("x ${ARROW_ZZZ_VERSION//./_} .", vals), "x c_2 ."
  )
})

# Memoize for demo/testing:
if (!memoise::is.memoized(download.file)) {
  download.file <- memoise::memoize(utils::download.file)
}
# Could point at a local copy of versions.txt instead of the GitHub URL.
versions_file <- "https://raw.githubusercontent.com/apache/arrow/master/cpp/thirdparty/versions.txt"
dep_list <- ..parse_lines(versions_file)
download_dir <- file.path(tempdir(), "thirdparty")
dir.create(download_dir)
..download_dependencies(dep_list, download_dir)
list.files(download_dir)
```