Skip to content

Instantly share code, notes, and snippets.

@knapply
Created July 18, 2020 15:47
Show Gist options
  • Save knapply/5d4bb3496ccae0d955a1c821fd84f964 to your computer and use it in GitHub Desktop.
Save knapply/5d4bb3496ccae0d955a1c821fd84f964 to your computer and use it in GitHub Desktop.
.file_extension <- function(x, dot = TRUE, ignore_zip_ext = FALSE) {
if (ignore_zip_ext) {
base_name <- sub("\\.[bgx]z2?$", "", basename(x))
} else {
base_name <- basename(x)
}
captures <- regexpr("(?<!^|[.]|/)[.]([^.]+)$", base_name, perl = TRUE)
out <- rep(NA_character_, length(x))
out[captures > 0L] <- substring(base_name[captures > 0L], captures[captures > 0L])
if (dot) out else substring(out, 2L)
}
.url_prefix <- function(x) {
vapply(x, function(.x) {
if (substring(.x, 1L, 8L) == "https://") {
"https://"
} else if ((prefix <- substring(.x, 1L, 7L)) %in% c("http://", "ftps://", "file://")) {
prefix
} else if (substring(.x, 1L, 6L) == "ftp://") {
"ftp://"
} else {
NA_character_
}
}, character(1L), USE.NAMES = FALSE)
}
.diagnose_input <- function(x, diagnose_type = TRUE) {
init <- list(
input = x,
url_prefix = .url_prefix(x),
file_ext = .file_extension(x)
)
init$compressed <- init$file_ext %in% c(".gz", ".bz", ".bz2", ".xz")
if (diagnose_type) {
if (!anyNA(init$url_prefix)) {
init$type <- "url"
} else if (!anyNA(init$file_ext)) {
init$type <- "file"
} else {
init$type <- "text"
}
}
structure(init, class = "data.frame", row.names = seq_along(x))
}
fparse <- function(input = NULL,
json_pointer = "",
input_type = c("auto", "text", "file", "url"),
empty_array = NULL,
empty_object = NULL,
max_simplify_lvl = c("data_frame", "matrix", "vector", "none"),
type_policy = c("anything_goes", "numbers", "strict"),
int64_opt = c("double", "string", "integer64"),
verbose = FALSE,
temp_dir = tempdir(),
keep_temp_files = FALSE) {
# validate arguments =========================================================
# types ----------------------------------------------------------------------
if (!is.character(json_pointer) || is.na(json_pointer) || length(json_pointer) != 1L) {
stop("`json_pointer=` must be a single, non-`NA` `character`.")
}
if (!is.character(input)) {
stop("`input=` must be a `character`.")
}
if (any(is.na(input)) || any(nchar(input) == 0L)) {
stop("`input=` contains `NA`s or empty strings.")
}
if (!dir.exists(temp_dir)) {
stop("`temp_dir=` does not exist.")
}
# prep options ===============================================================
# max_simplify_lvl -----------------------------------------------------------
if (!is.character(max_simplify_lvl) && !is.numeric(max_simplify_lvl)) {
stop("`max_simplify_lvl` must be of type `character` or `numeric`.")
}
if (is.numeric(max_simplify_lvl)) {
stopifnot(max_simplify_lvl %in% 0:3)
} else { # (is.character(max_simplify_lvl)) {
max_simplify_lvl <- switch(
match.arg(max_simplify_lvl, c("data_frame", "matrix", "vector", "none")),
data_frame = 0L,
matrix = 1L,
vector = 2L,
none = 3L,
stop("Unknown `max_simplify_lvl` argument.")
)
}
# type_policy ----------------------------------------------------------------
if (!is.character(type_policy) && !is.numeric(type_policy)) {
stop("`type_policy` must be of type `character` or `numeric`.")
}
if (is.numeric(type_policy)) {
stopifnot(max_simplify_lvl %in% 0:2)
} else { # if (is.character(type_policy)) {
type_policy <- switch(
match.arg(type_policy, c("anything_goes", "numbers", "strict")),
anything_goes = 0L,
numbers = 1L,
strict = 2L,
stop("Unknown `type_policy` argument.")
)
}
# int64_opt ------------------------------------------------------------------
if (!is.character(int64_opt) && !is.numeric(int64_opt)) {
stop("`int64_opt` must be of type `character` or `numeric`.")
}
if (is.numeric(int64_opt)) {
stopifnot(int64_opt %in% 0:2)
} else { # if (is.character(int64_opt)) {
int64_opt <- switch(
match.arg(int64_opt, c("double", "string", "integer64")),
double = 0L,
string = 1L,
integer64 = 2L,
stop("Unknown `int64_opt` argument.")
)
}
if (int64_opt == 2L && !requireNamespace("bit64", quietly = TRUE)) {
stop('`int64_opt = "integer64", but the {bit64} package is not installed.')
}
# diagnose input_type ========================================================
input_type <- match.arg(input_type, c("auto", "text", "file", "url"))
# auto -----------------------------------------------------------------------
if (input_type == "auto") {
if (any(substring(input, 1L, 1L) %in% c(" ", "{", "[", '"')) || any(substring(input, 1L, 4L) == "null")) {
input_type <- "text"
} else {
diagnosis <- .diagnose_input(input)
input_type <- unique(diagnosis$type)
if (length(input_type) != 1L) {
stop ("`input` should all be of the same `input_type`. Types detected:",
sprintf("\n\t- %s", input_type))
}
}
}
# url ------------------------------------------------------------------------
if (input_type == "url") {
for (i in seq_input) {
temp_file <- tempfile(fileext = diagnosis$file_ext[[i]], tmpdir = temp_dir)
switch(
diagnosis$url_prefix[[i]],
"https://" = ,
"ftps://" = ,
"http://" = ,
"ftp://" = download.file(diagnosis$input[[i]], destfile = temp_file, method = getOption("download.file.method", default = "auto"), quiet = !verbose),
"file://" = download.file(diagnosis$input[[i]], destfile = temp_file, method = "internal", quiet = !verbose),
stop("Unknown URL prefix")
)
diagnosis$input[[i]] <- temp_file
diagnosis$type[[i]] <- "file"
}
input_type <- unique(diagnosis$type)
stopifnot(length(input_type) == 1L)
if (!keep_temp_files) {
on.exit(unlink(diagnosis$input), add = TRUE)
}
}
# file -----------------------------------------------------------------------
input_decompressed <- FALSE
if (input_type == "file") {
if (any(diagnosis$compressed)) { # temporary... this can be done w/o materializing R strings in C++ for at least .gz, and Suggests to support others (?)
.input <- vector("character", length = length(input))
input_decompressed <- TRUE
if (verbose) message("Compressed files found. Decompressing...")
for (i in seq_input) {
if (diagnosis$compressed[[i]]) {
decomp_type <- switch(
diagnosis$file_ext[[i]],
".gz" = "gzip",
".bz" = ,
".bz2" = "bzip2",
".xz" = "xz"
,
"unknown"
)
con <- file(diagnosis$input[[i]], open = "rb")
raw_vec <- readBin(con, what = "raw", n = file.size(diagnosis$input[[i]]))
close(con)
.input[[i]] <- memDecompress(raw_vec, type = decomp_type, asChar = TRUE)
}
}
input_type <- "text"
} else {
diagnosis$input <- Sys.glob(diagnosis$input)
}
}
# set names ==================================================================
if (input_type != "text") {
.input <- diagnosis$input
}
if (input_type != "text" || input_decompressed) {
if (length(names(input))) {
names(.input) <- names(input)
} else {
names(.input) <- basename(input)
}
}
# deserialize ================================================================
switch(
input_type,
"text" = RcppSimdJson:::.deserialize_json(
json = if (input_decompressed) .input else input,
json_pointer = json_pointer,
empty_array = empty_array,
empty_object = empty_object,
simplify_to = max_simplify_lvl,
type_policy = type_policy,
int64_r_type = int64_opt
),
"file" = RcppSimdJson:::.load_json(
file_path = .input,
json_pointer = json_pointer,
empty_array = empty_array,
empty_object = empty_object,
simplify_to = max_simplify_lvl,
type_policy = type_policy,
int64_r_type = int64_opt
)
,
stop("Unknown `input_type`.")
)
}
# files <- dir("~/Documents/rcppsimdjson/inst/jsonexamples/", pattern = "\\.json$", full.names = TRUE, recursive = TRUE)
#
# urls <- c(
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/apache_builds.json",
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/mesh.json",
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/citm_catalog.json",
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/canada.json",
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/twitter.json",
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/github_events.json",
# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/gsoc-2018.json"
# )
#
# gz_files <- sapply(
# files[1:10],
# function(.x) {
# R.utils::compressFile(
# .x, remove = FALSE, FUN = gzfile, ext = "gz",
# destname = sprintf("%s/%s%s", tempdir(), basename(.x), ".gz")
# )
# }, USE.NAMES = FALSE
# )
#
# json_text <- c("[1,2,3]",'[4,5,6]')
# fparse(json_text)
#
# parsed_files <- fparse(files)
# names(parsed_files)
#
# download_and_parse_files <- fparse(urls)
# names(download_and_parse_files)
#
# inflate_and_parse <- fparse(gz_files)
# names(inflate_and_parse)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment