Created
March 8, 2018 17:08
-
-
Save mrdwab/8bfaa6198040e4e6608b006e3f6a8d2f to your computer and use it in GitHub Desktop.
Faster versions of `cSplit_e`, `numMat`, and `charMat`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cSplit_e_new <- function (indt, splitCols, sep = ",", mode = "binary", type = "numeric", | |
drop = FALSE, fixed = TRUE, fill = NULL) { | |
indt <- setDT(copy(indt)) | |
if (is.numeric(splitCols)) splitCols <- names(indt)[splitCols] | |
if (length(sep) == 1) sep <- rep(sep, length(splitCols)) | |
if (length(sep) != length(splitCols)) stop("Wrong number of sep supplied") | |
if (length(mode) == 1) mode <- rep(mode, length(splitCols)) | |
if (length(mode) != length(mode)) stop("Wrong number of mode supplied") | |
if (any(!mode %in% c("binary", "value", "count"))) { | |
stop("Mode must be `binary`, `value`, or `count`") | |
} | |
if (length(type) == 1) type <- rep(type, length(splitCols)) | |
if (length(type) != length(type)) stop("Wrong number of type supplied") | |
for (i in seq_along(splitCols)) { | |
a <- strsplit(as.character(indt[[splitCols[i]]]), sep[i], fixed = fixed) | |
TYPE <- type[i] | |
if (TYPE == "character") a <- trim_list(a) | |
temp <- switch( | |
TYPE, | |
numeric = numMat(a, mode = mode[i], fill = fill), | |
character = charMat(a, mode = mode[i], fill = fill), | |
stop("type must be numeric or character")) | |
temp <- as.data.table(temp) | |
NAMES <- if (TYPE == "character") { | |
names(temp) | |
} else { | |
sprintf("%s_%s", splitCols[i], seq_along(temp)) | |
} | |
set(indt, j = NAMES, value = temp) | |
if (isTRUE(drop)) set(indt, j = splitCols[i], value = NULL) | |
} | |
indt[] | |
} | |
numMat <- function (listOfValues, mode = "binary", fill = NULL) { | |
len <- length(listOfValues) | |
vec <- as.integer(unlist(listOfValues, use.names = FALSE)) | |
slvl <- seq(min(vec, na.rm = TRUE), max(vec, na.rm = TRUE)) | |
i.idx <- rep(seq_len(len), lengths(listOfValues)) | |
if (mode %in% c("binary", "value")) { | |
if (is.null(fill)) { | |
fill <- if (mode == "binary") 0L else NA_integer_ | |
} | |
out <- matrix(as.integer(fill), nrow = len, ncol = length(slvl), | |
dimnames = list(NULL, slvl)) | |
j.idx <- match(vec, slvl) | |
out[na.omit(cbind(i.idx, j.idx))] <- switch( | |
mode, binary = 1L, value = na.omit(vec), | |
stop("'mode' must be 'binary' or 'value'")) | |
} else if (mode == "count") { | |
out <- unclass(table(i.idx, factor(vec, slvl))) | |
if (!is.null(fill)) { | |
out <- replace(out, out == 0, fill) | |
} | |
} | |
out | |
} | |
charMat <- function (listOfValues, mode = "binary", fill = NULL) { | |
len <- length(listOfValues) | |
vec <- unlist(listOfValues, use.names = FALSE) | |
lvl <- sort(unique(vec)) | |
i.idx <- rep(seq.int(len), lengths(listOfValues)) | |
if (mode %in% c("binary", "value")) { | |
fill <- if (is.null(fill)) { | |
if (mode == "binary") 0L else NA_integer_ | |
} else { | |
fill | |
} | |
out <- matrix(as.integer(fill), nrow = len, ncol = length(lvl), | |
dimnames = list(NULL, lvl)) | |
j.idx <- match(vec, lvl) | |
out[na.omit(cbind(i.idx, j.idx))] <- switch( | |
mode, binary = 1L, value = na.omit(vec), | |
stop("'mode' must be 'binary' or 'value'")) | |
} else if (mode == "count") { | |
out <- unclass(table(i.idx, factor(vec, lvl))) | |
if (!is.null(fill)) { | |
out <- replace(out, out == 0, fill) | |
} | |
} | |
out | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment