@karldw
Created February 23, 2022 23:49
Parse thirdparty version file
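
The R snippet below parses Apache Arrow's `cpp/thirdparty/versions.txt`, substitutes each `${ARROW_..._VERSION}` placeholder the way bash parameter expansion would, and downloads the resulting third-party tarballs to a temporary directory.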
```r
..substitute_like_bash <- function(one_string, possible_values) {
  stopifnot(
    rlang::is_dictionaryish(possible_values),
    length(one_string) == 1,
    !anyNA(possible_values)
  )
  # Find the name of the version we want, something like
  # ARROW_RAPIDJSON_BUILD_VERSION
  # The `(//./_|:1)` group is a special case to handle some bash fanciness.
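  # (In bash, ${VAR:1} drops the first character and ${VAR//./_} replaces
  # every "." with "_"; the nested ifelse() below mimics both expansions.)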
  version_regex <- "\\$\\{(ARROW_[A-Z0-9_]+?_VERSION)(//./_|:1)?\\}"
  match_mat <- regmatches(
    one_string,
    gregexec(version_regex, one_string, perl = TRUE)
  )[[1]] # Subset [[1]] because one_string has length 1
  # But if there are multiple matches, string_to_sub could still have length > 1
  string_to_sub <- match_mat[1, , drop = TRUE]
  version_varnames <- match_mat[2, , drop = TRUE]
  version_values <- possible_values[version_varnames]
  bash_special_cases <- match_mat[3, , drop = TRUE]
  version_values <- ifelse(
    bash_special_cases == "", version_values,
    ifelse(
      bash_special_cases == ":1", substring(version_values, 2),
      ifelse(
        bash_special_cases == "//./_", gsub(".", "_", version_values, fixed = TRUE),
        NA_character_ # otherwise
      )
    )
  )
  num_to_sub <- length(string_to_sub)
  stopifnot(
    all(version_varnames %in% names(possible_values)),
    !anyNA(version_values),
    num_to_sub >= 1,
    num_to_sub < 10 # Something has gone wrong if we're doing 10+
  )
  out <- one_string
  for (idx in seq_len(num_to_sub)) {
    # sub(), not gsub(), in case there are duplicate placeholders
    out <- sub(string_to_sub[idx], version_values[idx], out, fixed = TRUE)
  }
  out
}
..substitute_all <- function(deps_unsubstituted, possible_values) {
  file_substituted <- vapply(
    deps_unsubstituted$filenames,
    ..substitute_like_bash,
    FUN.VALUE = character(1),
    possible_values = possible_values
  )
  url_substituted <- vapply(
    deps_unsubstituted$urls,
    ..substitute_like_bash,
    FUN.VALUE = character(1),
    possible_values = possible_values
  )
  list(
    filenames = unname(file_substituted),
    urls = unname(url_substituted)
  )
}
..parse_version_lines <- function(version_lines) {
  version_lines <- trimws(version_lines)
  version_regex <- "^(ARROW_[A-Z0-9_]+_)(VERSION|SHA256_CHECKSUM)=([^=]+)$"
  if (!all(grepl(version_regex, version_lines, perl = TRUE))) {
    stop("Failed to parse version lines")
  }
  match_list <- regmatches(
    version_lines,
    regexec(version_regex, version_lines, perl = TRUE)
  )
  # Find the lines where the second regex capture group is "VERSION" (as
  # opposed to "SHA256_CHECKSUM")
  version_idx <- vapply(
    match_list,
    function(m) m[[3]] == "VERSION",
    FUN.VALUE = logical(1)
  )
  version_matches <- match_list[version_idx]
  # The indexing here just pastes the first and second capture groups back
  # together, e.g. "ARROW_RAPIDJSON_BUILD_" and "VERSION"
  version_varnames <- vapply(
    version_matches,
    function(m) paste0(m[[2]], m[[3]]),
    FUN.VALUE = character(1)
  )
  version_values <- vapply(
    version_matches,
    function(m) m[[4]],
    FUN.VALUE = character(1)
  )
  names(version_values) <- version_varnames
  return(version_values)
}
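# A small worked example (hypothetical values, following the versions.txt format):
#   ..parse_version_lines(c(
#     "ARROW_RAPIDJSON_BUILD_VERSION=1.1.0",
#     "ARROW_RAPIDJSON_BUILD_SHA256_CHECKSUM=abc123"
#   ))
#   #> c(ARROW_RAPIDJSON_BUILD_VERSION = "1.1.0")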
..parse_dependency_array <- function(array_lines) {
  stopifnot(
    length(array_lines) >= 1,
    is.character(array_lines),
    !anyNA(array_lines)
  )
  array_lines <- trimws(array_lines)
  # Parse the array_lines with a regex. Each line of the array is a different
  # component, e.g.
  # `"ARROW_RAPIDJSON_URL rapidjson-${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz https://github.com/miloyip/rapidjson/archive/${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz"`
  # The first element is the variable name of the URL. That matters for cmake,
  # but not here. The second is the filename that will be saved (no directory).
  # The third is the URL, including some version string that's defined earlier
  # in the file.
  # Regex in words:
  # Start with `"ARROW_`, then any capital ASCII letter, number, or underscore.
  # After a space, match anything except a space, colon, or forward slash.
  # (Disallowing spaces is essential, and it's also what makes the array
  # parseable in bash. The colon and slash are just basic guards that this is
  # a filename.) Next, a space. Then a URL, starting with https:// and
  # including anything except a space. (This is the URL before substituting in
  # the version string, so normal URL parsing rules don't apply.)
  dep_array_regex <- '^"(ARROW_[A-Z0-9_]+_URL) ([^ :/"]+) (https://[^ "]+)"$'
  if (!all(grepl(dep_array_regex, array_lines, perl = TRUE))) {
    stop("Cannot parse thirdparty dependency array in expected format.")
  }
  list(
    filenames = gsub(dep_array_regex, "\\2", array_lines, perl = TRUE),
    urls = gsub(dep_array_regex, "\\3", array_lines, perl = TRUE)
  )
}
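# For the example line quoted in the comments above, this returns
#   list(
#     filenames = "rapidjson-${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz",
#     urls = "https://github.com/miloyip/rapidjson/archive/${ARROW_RAPIDJSON_BUILD_VERSION}.tar.gz"
#   )
# The placeholders are still unsubstituted; ..substitute_all() fills them in later.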
..parse_lines <- function(versions_file) {
  orig_lines <- readLines(versions_file)
  lines <- gsub("#.*", "", orig_lines, perl = TRUE)
  lines <- lines[lines != ""]
  dep_array_start_idx <- grep("^DEPENDENCIES=\\($", lines, perl = TRUE)
  dep_array_lines <- lines[
    seq.int(from = dep_array_start_idx + 1, to = length(lines) - 1, by = 1)
  ]
  version_lines <- lines[seq.int(1, dep_array_start_idx - 1, by = 1)]
  version_info <- ..parse_version_lines(version_lines)
  failed_to_parse <- anyNA(orig_lines) ||
    length(orig_lines) > 1000 ||
    length(lines) == 0 ||
    length(dep_array_start_idx) != 1 ||
    dep_array_start_idx <= 1 ||
    dep_array_start_idx >= length(lines) - 3 ||
    lines[length(lines)] != ")" ||
    length(dep_array_lines) == 0 ||
    anyNA(version_info)
  if (failed_to_parse) {
    stop(
      "Failed to parse 3rd party dependency file. It's possible the function ",
      "is not reading the correct file or the file formatting was not what ",
      "was expected.",
      call. = FALSE
    )
  }
  deps_unsubstituted <- ..parse_dependency_array(dep_array_lines)
  ..substitute_all(deps_unsubstituted, possible_values = version_info)
}
..download_dependencies <- function(dep_info, download_dir) {
  stopifnot(
    length(dep_info$urls) == length(dep_info$filenames),
    length(dep_info$urls) > 0
  )
  download_dir <- normalizePath(download_dir, winslash = "/", mustWork = TRUE)
  full_filenames <- file.path(download_dir, dep_info$filenames, fsep = "/")
  # Using libcurl here is well supported in R, but it is a different download
  # engine than the wget used in download_dependencies.sh.
  # libcurl is required for supplying multiple URLs, and is available in all
  # CRAN builds, but isn't guaranteed.
  download.file(dep_info$urls, full_filenames, method = "libcurl", quiet = TRUE)
}
library(testthat)
test_that("bash-equivalent substitution works", {
  vals <- c(ARROW_ZZZ_VERSION = "c.2", ARROW_AAA_VERSION = "v7")
  expect_equal(
    ..substitute_like_bash("x ${ARROW_ZZZ_VERSION} _", vals), "x c.2 _"
  )
  expect_equal(
    ..substitute_like_bash("https://example.com/${ARROW_AAA_VERSION:1}", vals),
    "https://example.com/7"
  )
  expect_equal(
    ..substitute_like_bash("x ${ARROW_ZZZ_VERSION//./_} .", vals), "x c_2 ."
  )
})

# Memoize for demo/testing:
if (!memoise::is.memoized(download.file)) {
  download.file <- memoise::memoize(utils::download.file)
}
# Could point at a local copy of versions.txt instead of the GitHub URL.
versions_file <- "https://raw.githubusercontent.com/apache/arrow/master/cpp/thirdparty/versions.txt"
dep_list <- ..parse_lines(versions_file)
download_dir <- file.path(tempdir(), "thirdparty")
dir.create(download_dir)
..download_dependencies(dep_list, download_dir)
list.files(download_dir)
```