Skip to content

Instantly share code, notes, and snippets.

@mrdwab
Last active March 10, 2018 11:39
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrdwab/a61efe8c201995633f30c7d7eaed40d4 to your computer and use it in GitHub Desktop.
Save mrdwab/a61efe8c201995633f30c7d7eaed40d4 to your computer and use it in GitHub Desktop.
Using `fread` as the workhorse for `cSplit`.
dt1 <- fread("V1 V2 V3
x b;c;d 1
y d;ef 2
z d;ef 3
m tmp 4
n tmp 5
n x;yz 5")
dt1[4, V2:=''] # this record will be lost because V2 value is empty string
dt1[5, V2:=NA_character_] # NA value is processed correctly
# Expect 11 x 3 data.table. Empty values in rows 8 & 9
cSplit_fread(dt1, "V2", ";", "long")
# Expect 18 x 3 data.table, 3 rows for each original row
cSplit_fread(dt1, "V2", ";", "long", makeEqual = TRUE)
dt2 <- fread("V1 V2 V3
x xA;xB;xC x1;x2;x3
y yD y1
z zF;zG z1")
# Expect 6 x 3, NA at [6, 3]
cSplit_fread(dt2, c("V2", "V3"), ";")
DT <- structure(list(Name = structure(
c(2L, 1L, 3L), .Label = c("Marcia", "Marcos", "Miguel"), class = "factor"),
Likes = structure(1:3, .Label = c("1,2,4,5,6", "1,2,5", "1,2,5,6"), class = "factor"),
Siblings = structure(c(3L, 1L, 2L), .Label = c("", "Parks , Ada", "Scott "), class = "factor"),
Hates = structure(c(2L, 1L, 3L), .Label = c("1;2;4;", "1;3;4;", "3;"), class = "factor")),
.Names = c("Name", "Likes", "Siblings", "Hates"), row.names = c(39L, 44L, 36L), class = "data.frame")
setDT(DT)
# Expect 1 + 5 + 2 + 4 columns
cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"))
# Expect 1 + 5 + 2 + 4 columns
# Expect "Siblings" to have extra whitespace
cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), stripWhite = FALSE)
# Expect 1 + 5 + 5 + 5 columns
cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), makeEqual = TRUE)
# Expect 12 x 4 data.table
cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), "long")
# Expect 15 x 4 data.table -- like melt(cSplit_fread(...), patterns(splitcols))
cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), "long")
cSplit_fread <- function(indt, splitCols, sep = ",", direction = "wide",
fixed = TRUE, drop = TRUE, stripWhite = TRUE,
makeEqual = NULL, type.convert = TRUE) {
o_names <- names(indt)
indt <- setDT(copy(indt))
if (is.numeric(splitCols)) splitCols <- names(indt)[splitCols]
if (length(sep) == 1) sep <- rep(sep, length(splitCols))
if (length(sep) != length(splitCols)) stop("Wrong number of sep supplied")
for (i in seq_along(splitCols)) {
if (isTRUE(fixed)) {
temp <- f_split(indt[[splitCols[i]]], sep[i], fixed, stripWhite, type.convert)
set(indt, j = sprintf("%s_%d", splitCols[i], seq_along(temp)), value = temp)
if (isTRUE(drop)) set(indt, j = splitCols[i], value = NULL)
} else {
temp <- t_split(indt[[splitCols[i]]], sep[i], fixed, stripWhite, type.convert)
set(indt, j = sprintf("%s_%d", splitCols[i], seq_along(temp)), value = temp)
if (isTRUE(drop)) set(indt, j = splitCols[i], value = NULL)
}
}
if (direction == "wide") {
if (isTRUE(makeEqual)) make_equal(indt, splitCols)[] else indt[]
} else if (direction == "long") {
indt[, ._ID1 := .I]
out <- set(suppressWarnings(
melt(make_equal(indt, splitCols), measure = patterns(splitCols),
value.name = splitCols)), j = "variable", value = NULL)
out[, ._ID2 := sequence(.N), ._ID1]
setorderv(out, c("._ID1", "._ID2"), na.last = TRUE)
out <- if (isTRUE(makeEqual)) out else out[!(long_fixer(out, splitCols) & ._ID2 != 1)]
set(out, j = c("._ID1", "._ID2"), value = NULL)
setcolorder(out, o_names)[]
}
}
long_fixer <- function(indt, cols) {
temp <- rowSums(is.na(indt[, ..cols])) + rowSums(indt[, ..cols] == "", na.rm = TRUE)
temp == length(cols)
}
make_equal <- function(indt, splitCols) {
check <- all_names(names(indt), stubs = splitCols, end_stub = FALSE)
if (length(check[["miss"]]) > 0) {
nat <- vapply(unname(check[["stubs"]]), function(x) {
typeof(indt[[grep(x, names(indt), fixed = TRUE)[[1]]]])
}, character(1L))
for (i in seq_along(nat)) {
COLS <- grep(names(nat[i]), check[["miss"]], value = TRUE, fixed = TRUE)
if (length(COLS) > 0) indt[, (COLS) := NA_type(nat[i])]
}
}
setcolorder(indt, check[["full_names"]])
}
NA_type <- function(string) {
switch(string,
double = NA_real_,
integer = NA_integer_,
complex = NA_complex_,
character = NA_character_,
NA)
}
all_names <- function(current_names, stubs, end_stub = FALSE) {
stub_names <- grep(paste(stubs, collapse = "|"), current_names, value = TRUE)
id_names <- grep(paste(stubs, collapse = "|"), current_names, value = TRUE, invert = TRUE)
levs <- unique(gsub(paste(stubs, collapse = "|"), "", stub_names))
stub_levs <- gsub("^\\^|\\$$", "",
do.call(paste0, expand.grid(stubs, levs)[if (end_stub) 2:1 else 1:2]))
full_names <- c(id_names, stub_levs[order(stub_levs)])
levs <- gsub("^[[:punct:]]|[[:punct:]]$", "", levs)
levs <- levs[order(levs)]
miss <- setdiff(full_names, current_names)
list(stubs = stubs,
stub_names = stub_names,
id_names = id_names,
levs = levs,
stub_levs = stub_levs,
full_names = full_names,
miss = miss)
}
t_split <- function(vec, sep, fixed = TRUE, stripWhite = TRUE, type.convert = TRUE) {
temp <- tstrsplit(vec, sep, fixed)
if (!isTRUE(stripWhite) & isTRUE(type.convert)) {
message("type.convert requires stripWhite = TRUE. Changing setting.")
}
if (isTRUE(stripWhite)) temp <- trim_list(temp, convert = type.convert)
setDT(temp)[]
}
f_split <- function(vec, sep, fixed = TRUE, stripWhite = TRUE, type.convert = TRUE) {
if ((sep == "") | (nchar(sep) > 1L) | (!isTRUE(fixed))) {
temp <- t_split(vec, sep, fixed, stripWhite, type.convert)
} else {
VEC <- as.character(vec)
ana <- if (anyNA(VEC)) is.na(VEC) else NULL
anb <- !nzchar(VEC)
if (is.null(ana) & !any(anb)) {
VEC <- stri_flatten(VEC, collapse = "\n")
} else {
if (!is.null(ana)) VEC[which(ana)] <- sep
if (any(anb)) VEC[which(anb)] <- sep
VEC <- stri_flatten(VEC, collapse = "\n")
}
temp <- fread(VEC, sep = sep, fill = TRUE,
blank.lines.skip = FALSE, header = FALSE,
colClasses = if (!isTRUE(type.convert)) "character" else NULL,
strip.white = stripWhite, logical01 = FALSE)[, lapply(
.SD, function(x) replace(x, x == "", NA))]
if (length(temp) == 1L) {
temp <- t_split(vec, sep, fixed, stripWhite, type.convert)
}
}
temp
}
## TESTS -------------------
cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","))
cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","))
cSplit_fread(concat.test, c("Likes", "Hates", "Siblings")) ## misses the ";" separated data
cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","), makeEqual = TRUE)
cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","), direction = "long")
cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","), direction = "long", makeEqual = FALSE)
DT <- as.data.table(concat.test)
DT_1000 <- rbindlist(replicate(ceiling(1000/nrow(DT)), DT, FALSE))
DT_100K <- rbindlist(replicate(ceiling(100000/nrow(DT_1000)), DT_1000, FALSE))
DT_10M <- rbindlist(replicate(100, DT_100K, FALSE))
@mrdwab
Copy link
Author

mrdwab commented Mar 8, 2018

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment