knapply/old-rcppsimdjson-prototype.R

## old-rcppsimdjson-prototype.R
.file_extension <- function(x, dot = TRUE, ignore_zip_ext = FALSE) {
    if (ignore_zip_ext) {
        base_name <- sub("\\.[bgx]z2?$", "", basename(x))
    } else {
        base_name <- basename(x)
    }
    captures <- regexpr("(?<!^|[.]|/)[.]([^.]+)$", base_name, perl = TRUE)
    out <- rep(NA_character_, length(x))
    out[captures > 0L] <- substring(base_name[captures > 0L], captures[captures > 0L])
    if (dot) out else substring(out, 2L)
}


.url_prefix <- function(x) {
    vapply(x, function(.x) {
        if (substring(.x, 1L, 8L) == "https://") {
            "https://"
        } else if ((prefix <- substring(.x, 1L, 7L)) %in% c("http://", "ftps://", "file://")) {
            prefix
        } else if (substring(.x, 1L, 6L) == "ftp://") {
            "ftp://"
        } else {
            NA_character_
        }
    }, character(1L), USE.NAMES = FALSE)
}


.diagnose_input <- function(x, diagnose_type = TRUE) {
    init <- list(
        input = x,
        url_prefix = .url_prefix(x),
        file_ext = .file_extension(x)
    )
    init$compressed <- init$file_ext %in% c(".gz", ".bz", ".bz2", ".xz")

    if (diagnose_type) {
        if (!anyNA(init$url_prefix)) {
            init$type <- "url"
        } else if (!anyNA(init$file_ext)) {
            init$type <- "file"
        } else {
            init$type <- "text"
        }
    }
    structure(init, class = "data.frame", row.names = seq_along(x))
}


fparse <- function(input = NULL,
                   json_pointer = "",
                   input_type = c("auto", "text", "file", "url"),
                   empty_array = NULL,
                   empty_object = NULL,
                   max_simplify_lvl = c("data_frame", "matrix", "vector", "none"),
                   type_policy = c("anything_goes", "numbers", "strict"),
                   int64_opt = c("double", "string", "integer64"),
                   verbose = FALSE,
                   temp_dir = tempdir(),
                   keep_temp_files = FALSE) {
    # validate arguments =========================================================
    # types ----------------------------------------------------------------------
    if (!is.character(json_pointer) || is.na(json_pointer) || length(json_pointer) != 1L) {
        stop("`json_pointer=` must be a single, non-`NA` `character`.")
    }
    if (!is.character(input)) {
        stop("`input=` must be a `character`.")
    }
    if (any(is.na(input)) || any(nchar(input) == 0L)) {
        stop("`input=` contains `NA`s or empty strings.")
    }
    if (!dir.exists(temp_dir)) {
        stop("`temp_dir=` does not exist.")
    }
    # prep options ===============================================================
    # max_simplify_lvl -----------------------------------------------------------
    if (!is.character(max_simplify_lvl) && !is.numeric(max_simplify_lvl)) {
        stop("`max_simplify_lvl` must be of type `character` or `numeric`.")
    }

    if (is.numeric(max_simplify_lvl)) {
        stopifnot(max_simplify_lvl %in% 0:3)
    } else { # (is.character(max_simplify_lvl)) {
        max_simplify_lvl <- switch(
            match.arg(max_simplify_lvl, c("data_frame", "matrix", "vector", "none")),
            data_frame = 0L,
            matrix = 1L,
            vector = 2L,
            none = 3L,
            stop("Unknown `max_simplify_lvl` argument.")
        )
    }
    # type_policy ----------------------------------------------------------------
    if (!is.character(type_policy) && !is.numeric(type_policy)) {
        stop("`type_policy` must be of type `character` or `numeric`.")
    }

    if (is.numeric(type_policy)) {
        stopifnot(max_simplify_lvl %in% 0:2)
    } else { # if (is.character(type_policy)) {
        type_policy <- switch(
            match.arg(type_policy, c("anything_goes", "numbers", "strict")),
            anything_goes = 0L,
            numbers = 1L,
            strict = 2L,
            stop("Unknown `type_policy` argument.")
        )
    }
    # int64_opt ------------------------------------------------------------------
    if (!is.character(int64_opt) && !is.numeric(int64_opt)) {
        stop("`int64_opt` must be of type `character` or `numeric`.")
    }

    if (is.numeric(int64_opt)) {
        stopifnot(int64_opt %in% 0:2)
    } else { # if (is.character(int64_opt)) {
        int64_opt <- switch(
            match.arg(int64_opt, c("double", "string", "integer64")),
            double = 0L,
            string = 1L,
            integer64 = 2L,
            stop("Unknown `int64_opt` argument.")
        )
    }

    if (int64_opt == 2L && !requireNamespace("bit64", quietly = TRUE)) {
        stop('`int64_opt = "integer64", but the {bit64} package is not installed.')
    }
    # diagnose input_type ========================================================
    input_type <- match.arg(input_type, c("auto", "text", "file", "url"))
    # auto -----------------------------------------------------------------------
    if (input_type == "auto") {
        if (any(substring(input, 1L, 1L) %in% c(" ", "{", "[", '"')) || any(substring(input, 1L, 4L) == "null")) {
            input_type <- "text"
        } else {
            diagnosis <- .diagnose_input(input)
            input_type <- unique(diagnosis$type)
            if (length(input_type) != 1L) {
                stop ("`input` should all be of the same `input_type`. Types detected:",
                      sprintf("\n\t- %s", input_type))
            }
        }
    }
    # url ------------------------------------------------------------------------
    if (input_type == "url") {
        for (i in seq_input) {
            temp_file <- tempfile(fileext = diagnosis$file_ext[[i]], tmpdir = temp_dir)

            switch(
                diagnosis$url_prefix[[i]],
                "https://" = ,
                "ftps://" = ,
                "http://" = ,
                "ftp://" = download.file(diagnosis$input[[i]], destfile = temp_file, method = getOption("download.file.method", default = "auto"), quiet = !verbose),
                "file://" = download.file(diagnosis$input[[i]], destfile = temp_file, method = "internal", quiet = !verbose),
                stop("Unknown URL prefix")
            )

            diagnosis$input[[i]] <- temp_file
            diagnosis$type[[i]] <- "file"
        }

        input_type <- unique(diagnosis$type)
        stopifnot(length(input_type) == 1L)
        if (!keep_temp_files) {
            on.exit(unlink(diagnosis$input), add = TRUE)
        }
    }
    # file -----------------------------------------------------------------------
    input_decompressed <- FALSE
    if (input_type == "file") {
        if (any(diagnosis$compressed)) { # temporary... this can be done w/o materializing R strings in C++ for at least .gz, and Suggests to support others (?)
            .input <- vector("character", length = length(input))
            input_decompressed <- TRUE

            if (verbose) message("Compressed files found. Decompressing...")
            for (i in seq_input) {
                if (diagnosis$compressed[[i]]) {
                    decomp_type <- switch(
                        diagnosis$file_ext[[i]],
                        ".gz" = "gzip",
                        ".bz" = ,
                        ".bz2" = "bzip2",
                        ".xz" = "xz"
                        ,
                        "unknown"
                    )

                    con <- file(diagnosis$input[[i]], open = "rb")
                    raw_vec <- readBin(con, what = "raw", n = file.size(diagnosis$input[[i]]))
                    close(con)
                    .input[[i]] <- memDecompress(raw_vec, type = decomp_type, asChar = TRUE)
                }
            }
            input_type <- "text"
        } else {
            diagnosis$input <- Sys.glob(diagnosis$input)
        }
    }
    # set names ==================================================================
    if (input_type != "text") {
        .input <- diagnosis$input
    }
    if (input_type != "text" || input_decompressed) {
        if (length(names(input))) {
            names(.input) <- names(input)
        } else {
            names(.input) <- basename(input)
        }
    }
    # deserialize ================================================================
    switch(
        input_type,
        "text" = RcppSimdJson:::.deserialize_json(
            json = if (input_decompressed) .input else input,
            json_pointer = json_pointer,
            empty_array = empty_array,
            empty_object = empty_object,
            simplify_to = max_simplify_lvl,
            type_policy = type_policy,
            int64_r_type = int64_opt
        ),
        "file" = RcppSimdJson:::.load_json(
            file_path = .input,
            json_pointer = json_pointer,
            empty_array = empty_array,
            empty_object = empty_object,
            simplify_to = max_simplify_lvl,
            type_policy = type_policy,
            int64_r_type = int64_opt
        )
        ,
        stop("Unknown `input_type`.")
    )
}


# files <- dir("~/Documents/rcppsimdjson/inst/jsonexamples/", pattern = "\\.json$", full.names = TRUE, recursive = TRUE)
#
# urls <- c(
#     "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/apache_builds.json",
#     "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/mesh.json",
#     "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/citm_catalog.json",
#     "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/canada.json",
#     "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/twitter.json",
#     "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/github_events.json",
#     "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/gsoc-2018.json"
# )
#
# gz_files <- sapply(
#     files[1:10],
#     function(.x) {
#         R.utils::compressFile(
#             .x, remove = FALSE, FUN = gzfile, ext = "gz",
#             destname = sprintf("%s/%s%s", tempdir(), basename(.x), ".gz")
#         )
#     }, USE.NAMES = FALSE
# )
#
# json_text <- c("[1,2,3]",'[4,5,6]')
# fparse(json_text)
#
# parsed_files <- fparse(files)
# names(parsed_files)
#
# download_and_parse_files <- fparse(urls)
# names(download_and_parse_files)
#
# inflate_and_parse <- fparse(gz_files)
# names(inflate_and_parse)
	.file_extension <- function(x, dot = TRUE, ignore_zip_ext = FALSE) {
	if (ignore_zip_ext) {
	base_name <- sub("\\.[bgx]z2?$", "", basename(x))
	} else {
	base_name <- basename(x)
	}
	captures <- regexpr("(?<!^\|[.]\|/)[.]([^.]+)$", base_name, perl = TRUE)
	out <- rep(NA_character_, length(x))
	out[captures > 0L] <- substring(base_name[captures > 0L], captures[captures > 0L])
	if (dot) out else substring(out, 2L)
	}


	.url_prefix <- function(x) {
	vapply(x, function(.x) {
	if (substring(.x, 1L, 8L) == "https://") {
	"https://"
	} else if ((prefix <- substring(.x, 1L, 7L)) %in% c("http://", "ftps://", "file://")) {
	prefix
	} else if (substring(.x, 1L, 6L) == "ftp://") {
	"ftp://"
	} else {
	NA_character_
	}
	}, character(1L), USE.NAMES = FALSE)
	}


	.diagnose_input <- function(x, diagnose_type = TRUE) {
	init <- list(
	input = x,
	url_prefix = .url_prefix(x),
	file_ext = .file_extension(x)
	)
	init$compressed <- init$file_ext %in% c(".gz", ".bz", ".bz2", ".xz")

	if (diagnose_type) {
	if (!anyNA(init$url_prefix)) {
	init$type <- "url"
	} else if (!anyNA(init$file_ext)) {
	init$type <- "file"
	} else {
	init$type <- "text"
	}
	}
	structure(init, class = "data.frame", row.names = seq_along(x))
	}


	fparse <- function(input = NULL,
	json_pointer = "",
	input_type = c("auto", "text", "file", "url"),
	empty_array = NULL,
	empty_object = NULL,
	max_simplify_lvl = c("data_frame", "matrix", "vector", "none"),
	type_policy = c("anything_goes", "numbers", "strict"),
	int64_opt = c("double", "string", "integer64"),
	verbose = FALSE,
	temp_dir = tempdir(),
	keep_temp_files = FALSE) {
	# validate arguments =========================================================
	# types ----------------------------------------------------------------------
	if (!is.character(json_pointer) \|\| is.na(json_pointer) \|\| length(json_pointer) != 1L) {
	stop("`json_pointer=` must be a single, non-`NA` `character`.")
	}
	if (!is.character(input)) {
	stop("`input=` must be a `character`.")
	}
	if (any(is.na(input)) \|\| any(nchar(input) == 0L)) {
	stop("`input=` contains `NA`s or empty strings.")
	}
	if (!dir.exists(temp_dir)) {
	stop("`temp_dir=` does not exist.")
	}
	# prep options ===============================================================
	# max_simplify_lvl -----------------------------------------------------------
	if (!is.character(max_simplify_lvl) && !is.numeric(max_simplify_lvl)) {
	stop("`max_simplify_lvl` must be of type `character` or `numeric`.")
	}

	if (is.numeric(max_simplify_lvl)) {
	stopifnot(max_simplify_lvl %in% 0:3)
	} else { # (is.character(max_simplify_lvl)) {
	max_simplify_lvl <- switch(
	match.arg(max_simplify_lvl, c("data_frame", "matrix", "vector", "none")),
	data_frame = 0L,
	matrix = 1L,
	vector = 2L,
	none = 3L,
	stop("Unknown `max_simplify_lvl` argument.")
	)
	}
	# type_policy ----------------------------------------------------------------
	if (!is.character(type_policy) && !is.numeric(type_policy)) {
	stop("`type_policy` must be of type `character` or `numeric`.")
	}

	if (is.numeric(type_policy)) {
	stopifnot(max_simplify_lvl %in% 0:2)
	} else { # if (is.character(type_policy)) {
	type_policy <- switch(
	match.arg(type_policy, c("anything_goes", "numbers", "strict")),
	anything_goes = 0L,
	numbers = 1L,
	strict = 2L,
	stop("Unknown `type_policy` argument.")
	)
	}
	# int64_opt ------------------------------------------------------------------
	if (!is.character(int64_opt) && !is.numeric(int64_opt)) {
	stop("`int64_opt` must be of type `character` or `numeric`.")
	}

	if (is.numeric(int64_opt)) {
	stopifnot(int64_opt %in% 0:2)
	} else { # if (is.character(int64_opt)) {
	int64_opt <- switch(
	match.arg(int64_opt, c("double", "string", "integer64")),
	double = 0L,
	string = 1L,
	integer64 = 2L,
	stop("Unknown `int64_opt` argument.")
	)
	}

	if (int64_opt == 2L && !requireNamespace("bit64", quietly = TRUE)) {
	stop('`int64_opt = "integer64", but the {bit64} package is not installed.')
	}
	# diagnose input_type ========================================================
	input_type <- match.arg(input_type, c("auto", "text", "file", "url"))
	# auto -----------------------------------------------------------------------
	if (input_type == "auto") {
	if (any(substring(input, 1L, 1L) %in% c(" ", "{", "[", '"')) \|\| any(substring(input, 1L, 4L) == "null")) {
	input_type <- "text"
	} else {
	diagnosis <- .diagnose_input(input)
	input_type <- unique(diagnosis$type)
	if (length(input_type) != 1L) {
	stop ("`input` should all be of the same `input_type`. Types detected:",
	sprintf("\n\t- %s", input_type))
	}
	}
	}
	# url ------------------------------------------------------------------------
	if (input_type == "url") {
	for (i in seq_input) {
	temp_file <- tempfile(fileext = diagnosis$file_ext[[i]], tmpdir = temp_dir)

	switch(
	diagnosis$url_prefix[[i]],
	"https://" = ,
	"ftps://" = ,
	"http://" = ,
	"ftp://" = download.file(diagnosis$input[[i]], destfile = temp_file, method = getOption("download.file.method", default = "auto"), quiet = !verbose),
	"file://" = download.file(diagnosis$input[[i]], destfile = temp_file, method = "internal", quiet = !verbose),
	stop("Unknown URL prefix")
	)

	diagnosis$input[[i]] <- temp_file
	diagnosis$type[[i]] <- "file"
	}

	input_type <- unique(diagnosis$type)
	stopifnot(length(input_type) == 1L)
	if (!keep_temp_files) {
	on.exit(unlink(diagnosis$input), add = TRUE)
	}
	}
	# file -----------------------------------------------------------------------
	input_decompressed <- FALSE
	if (input_type == "file") {
	if (any(diagnosis$compressed)) { # temporary... this can be done w/o materializing R strings in C++ for at least .gz, and Suggests to support others (?)
	.input <- vector("character", length = length(input))
	input_decompressed <- TRUE

	if (verbose) message("Compressed files found. Decompressing...")
	for (i in seq_input) {
	if (diagnosis$compressed[[i]]) {
	decomp_type <- switch(
	diagnosis$file_ext[[i]],
	".gz" = "gzip",
	".bz" = ,
	".bz2" = "bzip2",
	".xz" = "xz"
	,
	"unknown"
	)

	con <- file(diagnosis$input[[i]], open = "rb")
	raw_vec <- readBin(con, what = "raw", n = file.size(diagnosis$input[[i]]))
	close(con)
	.input[[i]] <- memDecompress(raw_vec, type = decomp_type, asChar = TRUE)
	}
	}
	input_type <- "text"
	} else {
	diagnosis$input <- Sys.glob(diagnosis$input)
	}
	}
	# set names ==================================================================
	if (input_type != "text") {
	.input <- diagnosis$input
	}
	if (input_type != "text" \|\| input_decompressed) {
	if (length(names(input))) {
	names(.input) <- names(input)
	} else {
	names(.input) <- basename(input)
	}
	}
	# deserialize ================================================================
	switch(
	input_type,
	"text" = RcppSimdJson:::.deserialize_json(
	json = if (input_decompressed) .input else input,
	json_pointer = json_pointer,
	empty_array = empty_array,
	empty_object = empty_object,
	simplify_to = max_simplify_lvl,
	type_policy = type_policy,
	int64_r_type = int64_opt
	),
	"file" = RcppSimdJson:::.load_json(
	file_path = .input,
	json_pointer = json_pointer,
	empty_array = empty_array,
	empty_object = empty_object,
	simplify_to = max_simplify_lvl,
	type_policy = type_policy,
	int64_r_type = int64_opt
	)
	,
	stop("Unknown `input_type`.")
	)
	}





	# files <- dir("~/Documents/rcppsimdjson/inst/jsonexamples/", pattern = "\\.json$", full.names = TRUE, recursive = TRUE)
	#
	# urls <- c(
	# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/apache_builds.json",
	# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/mesh.json",
	# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/citm_catalog.json",
	# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/canada.json",
	# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/twitter.json",
	# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/github_events.json",
	# "https://raw.githubusercontent.com/eddelbuettel/rcppsimdjson/master/inst/jsonexamples/gsoc-2018.json"
	# )
	#
	# gz_files <- sapply(
	# files[1:10],
	# function(.x) {
	# R.utils::compressFile(
	# .x, remove = FALSE, FUN = gzfile, ext = "gz",
	# destname = sprintf("%s/%s%s", tempdir(), basename(.x), ".gz")
	# )
	# }, USE.NAMES = FALSE
	# )
	#
	# json_text <- c("[1,2,3]",'[4,5,6]')
	# fparse(json_text)
	#
	# parsed_files <- fparse(files)
	# names(parsed_files)
	#
	# download_and_parse_files <- fparse(urls)
	# names(download_and_parse_files)
	#
	# inflate_and_parse <- fparse(gz_files)
	# names(inflate_and_parse)