Skip to content

Instantly share code, notes, and snippets.

@mrdwab
Last active March 3, 2018 16:16
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrdwab/ceded54b616bdc61cb0f to your computer and use it in GitHub Desktop.
Save mrdwab/ceded54b616bdc61cb0f to your computer and use it in GitHub Desktop.
Possible replacement for `merged.stack`. Need to figure out how to incorporate sep in here too.... See http://stackoverflow.com/a/34427860/1270695
# Should be faster than the other option here, hopefully with not too much overhead compared to `melt`
NA_type <- function(string) {
switch(string,
double = NA_real_,
integer = NA_integer_,
complex = NA_complex_,
character = NA_character_,
NA)
}
all_names <- function(current_names, stubs, end_stub = FALSE) {
stub_names <- grep(paste(stubs, collapse = "|"), current_names, value = TRUE)
id_names <- grep(paste(stubs, collapse = "|"), current_names, value = TRUE, invert = TRUE)
levs <- unique(gsub(paste(stubs, collapse = "|"), "", stub_names))
stub_levs <- gsub("^\\^|\\$$", "", do.call(paste0, expand.grid(stubs, levs)[if (end_stub) 2:1 else 1:2]))
full_names <- c(id_names, stub_levs)
full_names <- full_names[order(full_names)]
levs <- levs[order(levs)]
miss <- setdiff(full_names, current_names)
list(stubs = stubs,
stub_names = stub_names,
id_names = id_names,
levs = levs,
stub_levs = stub_levs,
full_names = full_names,
miss = miss)
}
ReshapeLong_ <- function(indt, stubs, value.name = NULL, variable.name = NULL, sep = NULL, end_stub = FALSE) {
indt <- copy(indt)
if (!is.data.table(indt)) indt <- as.data.table(indt)
check <- all_names(names(indt), stubs, end_stub)
if (length(check[["miss"]]) > 0) {
nat <- vapply(unname(check[["stubs"]]), function(x) {
typeof(indt[[grep(x, names(indt))[[1]]]][1])
}, character(1L))
for (i in seq_along(nat)) {
COLS <- grep(names(nat[i]), check[["miss"]], value = TRUE)
if (length(COLS) > 0) indt[, (COLS) := NA_type(nat[i])]
}
}
setcolorder(indt, check[["full_names"]])
valn <- if (is.null(names(stubs))) {
if (is.null(value.name)) stubs else value.name
} else {
names(stubs)
}
varn <- if (is.null(variable.name)) "variable" else variable.name
out <- melt(indt, measure = patterns(stubs),
value.name = valn,
variable.name = varn)
setattr(out[[varn]], "levels", check[["levs"]])
out
}
.getDots <- function(...) sapply(substitute(list(...))[-1], deparse)
library(data.table)
ReshapeLong <- function(indt, ..., sep = NULL) {
if (!is.data.table(indt)) indt <- as.data.table(indt)
stubs <- .getDots(...)
mv <- lapply(stubs, function(y) grep(sprintf("^%s", y), names(indt)))
levs <- unique(gsub(paste(stubs, collapse="|"), "", names(indt)[unlist(mv)]))
if (!is.null(sep)) levs <- gsub(sprintf("^%s", sep), "", levs, fixed = TRUE)
melt(indt, measure = mv, value.name = stubs)[
, variable := factor(variable, labels = levs)][]
}
ReshapeLong_ <- function(indt, stubs, sep = NULL) {
if (!is.data.table(indt)) indt <- as.data.table(indt)
mv <- lapply(stubs, function(y) grep(sprintf("^%s", y), names(indt)))
levs <- unique(gsub(paste(stubs, collapse="|"), "", names(indt)[unlist(mv)]))
if (!is.null(sep)) levs <- gsub(sprintf("^%s", sep), "", levs, fixed = TRUE)
melt(indt, measure = mv, value.name = stubs)[
, variable := factor(variable, labels = levs)][]
}
library(foreign)
dadmom <- read.dta("https://stats.idre.ucla.edu/stat/stata/modules/dadmomw.dta")
ReshapeLong(dadmom, name, inc)
ReshapeLong_(dadmom, c("name", "inc"))
library(data.table)
set.seed(2334)
df_full <- data.table(a_alpha = rnorm(10), a_beta = rnorm(10), a_gamma = rnorm(10),
b_alpha = rnorm(10), b_beta = rnorm(10), b_gamma = rnorm(10), id = c(1:10))
df_miss <- copy(df_full)[, c("a_beta", "b_gamma") := NULL][]
df_mess <- copy(df_miss)
setcolorder(df_mess, c(1, 5, 2, 4, 3))
names(df_mess)
stubs_start <- c("a_", "b_")
stubs_end <- c("alpha$", "beta$", "gamma$")
stubs_named_start <- c("A" = "a_", "B" = "b_")
stubs_named_end <- c("ALPHA" = "_alpha$", "BETA" = "_beta$", "GAMMA" = "_gamma$")
ReshapeLong_(df_full, stubs_start)
ReshapeLong_(df_miss, stubs_start)
rl1 <- ReshapeLong_(df_mess, stubs_start)
ReshapeLong_(df_full, stubs_named_start)
ReshapeLong_(df_miss, stubs_named_end, end_stub = TRUE)
# Named stubs take precedence
ReshapeLong_(df_miss, stubs_named_end, value.name = c("x", "Y", "Z"), end_stub = TRUE)
ReshapeLong_(df_full, stubs_end, end_stub = TRUE)
ReshapeLong_(df_miss, c("alpha", "beta", "gamma"), end_stub = TRUE)
rl2 <- ReshapeLong_(df_mess, stubs_end, c("alpha", "beta", "gamma"), end_stub = TRUE)
library(splitstackshape)
merged.stack(df_full, var.stubs = stubs_start, sep = "var.stubs")
ms1 <- merged.stack(df_mess, var.stubs = stubs_start, sep = "var.stubs")
ms2 <- merged.stack(df_mess, var.stubs = stubs_end, sep = "var.stubs", atStart = FALSE)
library(compare)
compare(rl1, ms1, allowAll = TRUE)
compare(rl2, ms2, allowAll = TRUE)
library(microbenchmark)
microbenchmark(ReshapeLong_(df_mess, stubs_start), merged.stack(df_mess, var.stubs = stubs_start, sep = "var.stubs"))
microbenchmark(melt(df_full, measure.vars = patterns(stubs_start), value.name = stubs_start),
ReshapeLong_(df_full, stubs_start),
ReshapeLong_(df_full, stubs_named_start),
merged.stack(df_full, var.stubs = stubs_start, sep = "var.stubs"))
melt(df_full, measure.vars = patterns(stubs_start), value.name = stubs_start)
ReshapeLong_(df_full, stubs_start)
merged.stack(df_full, var.stubs = stubs_start, sep = "var.stubs")
### BIGGER DATA
set.seed(1)
Nrow <- 1000000
Ncol <- 10
mybigdf <- cbind(id = 1:Nrow, as.data.frame(matrix(rnorm(Nrow*Ncol), nrow=Nrow)))
head(mybigdf)
dim(mybigdf)
tail(mybigdf)
A <- names(mybigdf)
names(mybigdf) <- c("id", paste("varA", 1:3, sep = "_"),
paste("varB", 1:4, sep = "_"),
paste("varC", 1:3, sep = "_"))
DT <- as.data.table(mybigdf)
melt(DT, measure.vars = patterns("varA", "varB", "varC"))
ReshapeLong_(DT, c("varA", "varB", "varC"))
ReshapeLong_(DT, c("THIS" = "varA", "THAT" = "varB", "THOSE" = "varC"))
merged.stack(DT, id.vars="id", var.stubs=c("varA", "varB", "varC"), sep = "_")
### SLOW OPTIONS
library(tidyverse)
library(dtplyr)
melt(DT, id.vars = "id")[
, c("col", "lev") := tstrsplit(variable, "_")][
, variable := NULL][
, dcast(.SD, id + lev ~ col, value.var = "value")]
DT %>%
gather(var, val, -id) %>%
separate(var, into = c("col", "lev")) %>%
spread(col, val)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment