Created
July 18, 2020 15:47
-
-
Save knapply/5d4bb3496ccae0d955a1c821fd84f964 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.file_extension <- function(x, dot = TRUE, ignore_zip_ext = FALSE) { | |
if (ignore_zip_ext) { | |
base_name <- sub("\\.[bgx]z2?$", "", basename(x)) | |
} else { | |
base_name <- basename(x) | |
} | |
captures <- regexpr("(?<!^|[.]|/)[.]([^.]+)$", base_name, perl = TRUE) | |
out <- rep(NA_character_, length(x)) | |
out[captures > 0L] <- substring(base_name[captures > 0L], captures[captures > 0L]) | |
if (dot) out else substring(out, 2L) | |
} | |
.url_prefix <- function(x) { | |
vapply(x, function(.x) { | |
if (substring(.x, 1L, 8L) == "https://") { | |
"https://" | |
} else if ((prefix <- substring(.x, 1L, 7L)) %in% c("http://", "ftps://", "file://")) { | |
prefix | |
} else if (substring(.x, 1L, 6L) == "ftp://") { | |
"ftp://" | |
} else { | |
NA_character_ | |
} | |
}, character(1L), USE.NAMES = FALSE) | |
} | |
.diagnose_input <- function(x, diagnose_type = TRUE) { | |
init <- list( | |
input = x, | |
url_prefix = .url_prefix(x), | |
file_ext = .file_extension(x) | |
) | |
init$compressed <- init$file_ext %in% c(".gz", ".bz", ".bz2", ".xz") | |
if (diagnose_type) { | |
if (!anyNA(init$url_prefix)) { | |
init$type <- "url" | |
} else if (!anyNA(init$file_ext)) { | |
init$type <- "file" | |
} else { | |
init$type <- "text" | |
} | |
} | |
structure(init, class = "data.frame", row.names = seq_along(x)) | |
} | |
fparse <- function(input = NULL, | |
json_pointer = "", | |
input_type = c("auto", "text", "file", "url"), | |
empty_array = NULL, | |
empty_object = NULL, | |
max_simplify_lvl = c("data_frame", "matrix", "vector", "none"), | |
type_policy = c("anything_goes", "numbers", "strict"), | |
int64_opt = c("double", "string", "integer64"), | |
verbose = FALSE, | |
temp_dir = tempdir(), | |
keep_temp_files = FALSE) { | |
# validate arguments ========================================================= | |
# types ---------------------------------------------------------------------- | |
if (!is.character(json_pointer) || is.na(json_pointer) || length(json_pointer) != 1L) { | |
stop("`json_pointer=` must be a single, non-`NA` `character`.") | |
} | |
if (!is.character(input)) { | |
stop("`input=` must be a `character`.") | |
} | |
if (any(is.na(input)) || any(nchar(input) == 0L)) { | |
stop("`input=` contains `NA`s or empty strings.") | |
} | |
if (!dir.exists(temp_dir)) { | |
stop("`temp_dir=` does not exist.") | |
} | |
# prep options =============================================================== | |
# max_simplify_lvl ----------------------------------------------------------- | |
if (!is.character(max_simplify_lvl) && !is.numeric(max_simplify_lvl)) { | |
stop("`max_simplify_lvl` must be of type `character` or `numeric`.") | |
} | |
if (is.numeric(max_simplify_lvl)) { | |
stopifnot(max_simplify_lvl %in% 0:3) | |
} else { # (is.character(max_simplify_lvl)) { | |
max_simplify_lvl <- switch( | |
match.arg(max_simplify_lvl, c("data_frame", "matrix", "vector", "none")), | |
data_frame = 0L, | |
matrix = 1L, | |
vector = 2L, | |
none = 3L, | |
stop("Unknown `max_simplify_lvl` argument.") | |
) | |
} | |
# type_policy ---------------------------------------------------------------- | |
if (!is.character(type_policy) && !is.numeric(type_policy)) { | |
stop("`type_policy` must be of type `character` or `numeric`.") | |
} | |
if (is.numeric(type_policy)) { | |
stopifnot(max_simplify_lvl %in% 0:2) | |
} else { # if (is.character(type_policy)) { | |
type_policy <- switch( | |
match.arg(type_policy, c("anything_goes", "numbers", "strict")), | |
anything_goes = 0L, | |
numbers = 1L, | |
strict = 2L, | |
stop("Unknown `type_policy` argument.") | |
) | |
} | |
# int64_opt ------------------------------------------------------------------ | |
if (!is.character(int64_opt) && !is.numeric(int64_opt)) { | |
stop("`int64_opt` must be of type `character` or `numeric`.") | |
} | |
if (is.numeric(int64_opt)) { | |
stopifnot(int64_opt %in% 0:2) | |
} else { # if (is.character(int64_opt)) { | |
int64_opt <- switch( | |
match.arg(int64_opt, c("double", "string", "integer64")), | |
double = 0L, | |
string = 1L, | |
integer64 = 2L, | |
stop("Unknown `int64_opt` argument.") | |
) | |
} | |
if (int64_opt == 2L && !requireNamespace("bit64", quietly = TRUE)) { | |
stop('`int64_opt = "integer64", but the {bit64} package is not installed.') | |
} | |
# diagnose input_type ======================================================== | |
input_type <- match.arg(input_type, c("auto", "text", "file", "url")) | |
# auto ----------------------------------------------------------------------- | |
if (input_type == "auto") { | |
if (any(substring(input, 1L, 1L) %in% c(" ", "{", "[", '"')) || any(substring(input, 1L, 4L) == "null")) { | |
input_type <- "text" | |
} else { | |
diagnosis <- .diagnose_input(input) | |
input_type <- unique(diagnosis$type) | |
if (length(input_type) != 1L) { | |
stop ("`input` should all be of the same `input_type`. Types detected:", | |
sprintf("\n\t- %s", input_type)) | |
} | |
} | |
} | |
# url ------------------------------------------------------------------------ | |
if (input_type == "url") { | |
for (i in seq_input) { | |
temp_file <- tempfile(fileext = diagnosis$file_ext[[i]], tmpdir = temp_dir) | |
switch( | |
diagnosis$url_prefix[[i]], | |
"https://" = , | |
"ftps://" = , | |
"http://" = , | |
"ftp://" = download.file(diagnosis$input[[i]], destfile = temp_file, method = getOption("download.file.method", default = "auto"), quiet = !verbose), | |
"file://" = download.file(diagnosis$input[[i]], destfile = temp_file, method = "internal", quiet = !verbose), | |
stop("Unknown URL prefix") | |
) | |
diagnosis$input[[i]] <- temp_file | |
diagnosis$type[[i]] <- "file" | |
} | |
input_type <- unique(diagnosis$type) | |
stopifnot(length(input_type) == 1L) | |
if (!keep_temp_files) { | |
on.exit(unlink(diagnosis$input), add = TRUE) | |
} | |
} | |
# file ----------------------------------------------------------------------- | |
input_decompressed <- FALSE | |
if (input_type == "file") { | |
if (any(diagnosis$compressed)) { # temporary... this can be done w/o materializing R strings in C++ for at least .gz, and Suggests to support others (?) | |
.input <- vector("character", length = length(input)) | |
input_decompressed <- TRUE | |
if (verbose) message("Compressed files found. Decompressing...") | |
for (i in seq_input) { | |
if (diagnosis$compressed[[i]]) { | |
decomp_type <- switch( | |
diagnosis$file_ext[[i]], | |
".gz" = "gzip", | |
".bz" = , | |
".bz2" = "bzip2", | |
".xz" = "xz" | |
, | |
"unknown" | |
) | |
con <- file(diagnosis$input[[i]], open = "rb") | |
raw_vec <- readBin(con, what = "raw", n = file.size(diagnosis$input[[i]])) | |
close(con) | |
.input[[i]] <- memDecompress(raw_vec, type = decomp_type, asChar = TRUE) | |
} | |
} | |
input_type <- "text" | |
} else { | |
diagnosis$input <- Sys.glob(diagnosis$input) | |
} | |
} | |
# set names ================================================================== | |
if (input_type != "text") { | |
.input <- diagnosis$input | |
} | |
if (input_type != "text" || input_decompressed) { | |
if (length(names(input))) { | |
names(.input) <- names(input) | |
} else { | |
names(.input) <- basename(input) | |
} | |
} | |
# deserialize ================================================================ | |
switch( | |
input_type, | |
"text" = RcppSimdJson:::.deserialize_json( | |
json = if (input_decompressed) .input else input, | |
json_pointer = json_pointer, | |
empty_array = empty_array, | |
empty_object = empty_object, | |
simplify_to = max_simplify_lvl, | |
type_policy = type_policy, | |
int64_r_type = int64_opt | |
), | |
"file" = RcppSimdJson:::.load_json( | |
file_path = .input, | |
json_pointer = json_pointer, | |
empty_array = empty_array, | |
empty_object = empty_object, | |
simplify_to = max_simplify_lvl, | |
type_policy = type_policy, | |
int64_r_type = int64_opt | |
) | |
, | |
stop("Unknown `input_type`.") | |
) | |
} | |
# files <- dir("~/Documents/rcppsimdjson/inst/jsonexamples/", pattern = "\\.json$", full.names = TRUE, recursive = TRUE) | |
# | |
# urls <- c( | |
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/apache_builds.json", | |
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/mesh.json", | |
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/citm_catalog.json", | |
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/canada.json", | |
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/twitter.json", | |
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/github_events.json", | |
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/gsoc-2018.json" | |
# ) | |
# | |
# gz_files <- sapply( | |
# files[1:10], | |
# function(.x) { | |
# R.utils::compressFile( | |
# .x, remove = FALSE, FUN = gzfile, ext = "gz", | |
# destname = sprintf("%s/%s%s", tempdir(), basename(.x), ".gz") | |
# ) | |
# }, USE.NAMES = FALSE | |
# ) | |
# | |
# json_text <- c("[1,2,3]",'[4,5,6]') | |
# fparse(json_text) | |
# | |
# parsed_files <- fparse(files) | |
# names(parsed_files) | |
# | |
# download_and_parse_files <- fparse(urls) | |
# names(download_and_parse_files) | |
# | |
# inflate_and_parse <- fparse(gz_files) | |
# names(inflate_and_parse) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment