Last active
December 30, 2015 03:59
-
-
Save asieira/7772953 to your computer and use it in GitHub Desktop.
A wrapper around function rbindlist() in R package data.table that tries to handle some of its limitations. Basically it preprocess the input parameter by filling in any missing columns with NAs, ensuring all column types match and are in the same order before passing it along to rbindlist.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A wrapper around rbindlist that ensures that all columns are matched in type and | |
# in order before calling it. | |
# | |
smartrbindlist <- function(l) { | |
if (missing(l) && !is.list(l)) { | |
stop("smartrbindlist: bad or missing argument 'l") | |
} | |
mergedColClasses = list() | |
i=1 | |
while(i <= length(l)) { | |
# Validate entry types and get column names | |
if (is.data.frame(l[[i]])) l[[i]] = as.data.table(l[[i]]) | |
if (is.data.table(l[[i]])) { | |
cn = colnames(l[[i]]) | |
} else if (is.list(l[[i]])) { | |
cn = names(l[[i]]) | |
} else { | |
if (!is.null(l[[i]])) { | |
warning(sprintf("smartrbindlist: skipping entry %i with unexpected class [%s]", | |
i, paste(class(l[[i]]), collapse=", "))) | |
} | |
cn = character() | |
} | |
if (length(cn) == 0) { | |
l[[i]] = NULL | |
next | |
} | |
# For all column names we've seen before check the classes for inconsistencies | |
match_cn = cn %chin% names(mergedColClasses) | |
for (col in cn[match_cn]) { | |
if (!isTRUE(all.equal(sort(mergedColClasses[[col]]), sort(class(l[[i]][[col]]))))) { | |
msg = sprintf("smartrbindlist: column %s has different classes in entry %i [%s] and its predecessors [%s]", | |
col, i, paste(class(l[[i]][[col]]), collapse=", "), | |
paste(mergedColClasses[[col]], collapse=", ")) | |
stop(msg) | |
} | |
} | |
# Add previously unknown column names to list of known ones. | |
mergedColClasses[cn[!match_cn]] = lapply(cn[!match_cn], function(x) class(l[[i]][[x]])) | |
i = i+1 | |
} | |
if (length(l) == 0) return(data.table()) | |
# Now we do a second pass, add any missing columns and reorder fields/columns | |
for (i in 1:length(l)) { | |
if (is.data.table(l[[i]])) { | |
cn = colnames(l[[i]]) | |
if (nrow(l[[i]]) == 0) { | |
for (col in names(mergedColClasses)[!names(mergedColClasses) %chin% cn]) { | |
l[[i]][[col]] = getNAforClass(mergedColClasses[[col]])[F] | |
} | |
} else { | |
for (col in names(mergedColClasses)[!names(mergedColClasses) %chin% cn]) { | |
l[[i]][[col]] = getNAforClass(mergedColClasses[[col]]) | |
} | |
} | |
setcolorder(l[[i]], names(mergedColClasses)) | |
} else if (is.list(l[[i]])) { | |
cn = names(l[[i]]) | |
for (col in names(mergedColClasses)[!names(mergedColClasses) %chin% cn]) { | |
l[[i]][[col]] = rep(getNAforClass(mergedColClasses[[col]]), length(l[[i]][[1]])) | |
} | |
l[[i]] = l[[i]][names(mergedColClasses)] | |
} | |
} | |
return(rbindlist(l)) | |
} | |
# Returns a single NA value for the given class (as returned by the class function). | |
# Extend this to you heart's content. :) | |
# If a type is not recognized, returns plain NA (logical). | |
# | |
getNAforClass <- function(cl) { | |
if ("numeric" %chin% cl) { | |
return(NA_real_) | |
} else if ("character" %chin% cl) { | |
return(NA_character_) | |
} else if ("factor" %chin% cl) { | |
return(as.factor(NA_character_)) | |
} else if ("int64" %chin% cl) { | |
return(as.int64(NA_real_)) | |
} else if ("integer" %chin% cl) { | |
return(NA_integer_) | |
} else if ("POSIXct" %chin% cl) { | |
return(as.POSIXct(NA_real_, origin="1970-01-01")) | |
} else { | |
return(NA) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See the discussion that led me to write this here and here.