Last active
September 26, 2021 03:54
-
-
Save rnfermincota/74169e2e73f199165e52b8393e1fbb01 to your computer and use it in GitHub Desktop.
Separate and unite are complements
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rm(list=ls()) | |
#----------------------------------------------------------------------------- | |
library(dplyr) | |
library(tidyr) | |
library(data.table) | |
#----------------------------------------------------------------------------- | |
#---------------------------------------------------------------------------- | |
# Example5 | |
#---------------------------------------------------------------------------- | |
dt = data.table( | |
x=sample(5,20,TRUE), | |
y=sample(2,20,TRUE), | |
z=sample(letters[1:2], 20,TRUE), | |
d1 = runif(20), | |
d2=1L | |
) | |
df <- tbl_df(dt) | |
# multiple fun.agg and value.var (all combinations) | |
out1 <- dcast.data.table( | |
dt, | |
x + y ~ z, | |
fun=list(sum, mean), | |
value.var=c("d1", "d2"), | |
variable.factor = FALSE | |
) | |
out2 <- df %>% | |
group_by(x, y, z) %>% summarise( | |
dsum1=sum(d1), | |
dmean1=mean(d1), | |
dsum2=sum(d2), | |
dmean2=mean(d2) | |
) | |
out2 <- out2 %>% | |
unite(a1_b1, dsum1, dmean1, sep=",") %>% | |
unite(a2_b2, dsum2, dmean2, sep=",") %>% | |
unite(a1_b1_a2_b2, a1_b1, a2_b2, sep="|") %>% | |
spread(z, a1_b1_a2_b2) | |
iff <- function(x, v=0){ | |
if (is.na(x) | x == "NA"){ | |
if (v==0){ | |
return("NA,NA|NA,NA") | |
}else{ | |
return(0) | |
} | |
}else{ | |
if (x==0){ | |
return("0,0|0,0") | |
}else{ | |
return(x) | |
} | |
} | |
} | |
out2 <- out2 %>% | |
mutate(a=sapply(a, function(x) iff(x, 0)), b=sapply(b, function(x) iff(x,0))) | |
out2 <- out2 %>% | |
separate(a, c("d1_a", "d2_a"), sep="[|]") %>% | |
separate(b, c("d1_b", "d2_b"), sep="[|]") | |
out2 <- out2 %>% | |
separate(d1_a, c("d1_sum_a", "d1_mean_a"), convert=TRUE, sep=",") %>% | |
separate(d2_a, c("d2_sum_a", "d2_mean_a"), convert=TRUE, sep=",") %>% | |
separate(d1_b, c("d1_sum_b", "d1_mean_b"), convert=TRUE, sep=",") %>% | |
separate(d2_b, c("d2_sum_b", "d2_mean_b"), convert=TRUE, sep=",") %>% | |
select(x, y, d1_sum_a, d1_sum_b, d2_sum_a, d2_sum_b, d1_mean_a, d1_mean_b, d2_mean_a, d2_mean_b) | |
out2 <- out2 %>% | |
mutate( | |
d1_sum_a=sapply(d1_sum_a, function(x) iff(x,1)), | |
d1_sum_b=sapply(d1_sum_b, function(x) iff(x,1)), | |
d2_sum_a=sapply(d2_sum_a, function(x) as.integer(iff(x,1))), | |
d2_sum_b=sapply(d2_sum_b, function(x) as.integer(iff(x,1))) | |
) | |
all.equal(as.data.frame(out1), out2, check.attributes=FALSE) | |
rm(df, dt) | |
rm(iff, out1, out2) | |
#---------------------------------------------------------------------------- | |
# Example4 | |
#---------------------------------------------------------------------------- | |
dt = data.table( | |
x=sample(5,20,TRUE), | |
y=sample(2,20,TRUE), | |
z=sample(letters[1:2], 20,TRUE), | |
d1 = runif(20), | |
d2=1L | |
) | |
df <- tbl_df(dt) | |
out1 <- dcast.data.table( | |
dt, | |
x + y ~ z, | |
fun=sum, | |
value.var=c("d1","d2"), | |
variable.factor = FALSE | |
)[order(x,y)] | |
out2 <- df %>% | |
group_by(x, y, z) %>% | |
summarise( | |
d1_sum=sum(d1), | |
d2_sum=sum(d2) | |
) | |
out2 <- out2 %>% | |
unite(a_b, d1_sum, d2_sum, sep=",") %>% | |
spread(z, a_b, fill = 0) %>% | |
mutate( | |
a=ifelse(a==0, "0,0", a), | |
b=ifelse(b==0, "0,0", b) | |
) | |
out2 <- out2 %>% | |
separate(a, c("d1_sum_a", "d2_sum_a"), sep=",", convert=TRUE) %>% | |
separate(b, c("d1_sum_b", "d2_sum_b"), sep=",", convert=TRUE) %>% | |
select(x, y, d1_sum_a, d1_sum_b, d2_sum_a, d2_sum_b) | |
all.equal(as.data.frame(out1), out2, check.attributes=FALSE) | |
rm(df, dt) | |
rm(out1, out2) | |
#---------------------------------------------------------------------------- | |
# Example3 | |
#---------------------------------------------------------------------------- | |
set.seed(45) | |
DT <- data.table( | |
i_1 = c(1:5, NA), | |
i_2 = c(NA,6,7,8,9,10), | |
f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)), | |
f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE), | |
c_1 = sample(c(letters[1:3], NA), 6, TRUE), | |
d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"), | |
d_2 = as.Date(6:1, origin="2012-01-01")) | |
DF <- tibble(DT) | |
# add a couple of list cols | |
set.seed(123) | |
DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c] | |
set.seed(123) | |
DF <- DF %>% | |
group_by(i_1) %>% | |
mutate(l_1 = list(rep(i_1, sample(5,1)))) %>% | |
ungroup | |
all.equal(data.frame(DT), DF, check.attributes = FALSE) | |
set.seed(123) | |
DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c] | |
set.seed(123) | |
DF <- DF %>% | |
group_by(i_1) %>% | |
mutate(l_2 = list(rep(c_1, sample(5,1)))) %>% | |
ungroup | |
all.equal(data.frame(DT), DF, check.attributes = FALSE) | |
out1 <- melt.data.table( | |
DT, | |
id=1:2, | |
measure=patterns("^f_", "^d_"), | |
value.factor=TRUE, | |
variable.factor = FALSE) | |
out2 <- DF %>% select(-matches("^c_|^l_")) %>% | |
unite("1", f_1, d_1, sep=",") %>% | |
unite("2", f_2, d_2, sep=",") | |
out2 <- out2 %>% | |
gather(variable, value, 3:4) %>% | |
separate(value, c("value1", "value2"), sep=",", convert=TRUE) | |
out2 <- out2 %>% mutate( | |
value1=factor(value1, levels=sort(unique(out2$value1)), ordered=TRUE), | |
value2=as.Date(value2) | |
) | |
all.equal(out1, as.data.table(out2)) | |
rm(DF, DT) | |
rm(out1, out2) | |
#---------------------------------------------------------------------------- | |
# Example2 | |
#---------------------------------------------------------------------------- | |
df <- data.frame(x = c("a", "a b", "a b c", NA), stringsAsFactors = FALSE) | |
dt <- data.table(df) | |
fstrsplit <- function(x, d){ | |
v <- strsplit(x, d, fixed = FALSE) | |
for (i in 1:length(v)){ | |
if (length(v[[i]]) == 1){v[[i]][2]<- v[[i]][1]; v[[i]][1]<- NA} | |
if (length(v[[i]]) > 2){v[[i]][2] <- paste(v[[i]][2:length(v[[i]])], collapse=" ")} | |
} | |
n <- length(v[[1]]) # assuming all lists in before have the same length | |
# http://stackoverflow.com/questions/16179197/transpose-a-list-of-lists | |
lapply(1:n, function(i) sapply(v, "[[", i)) | |
} | |
out1 <- df %>% separate(x, c("a", "b"), extra = "merge", fill = "left") | |
out2 <- dt[, c("a", "b") := fstrsplit(x, " ")][,x:=NULL] # TO REVISE: WRONG | |
all.equal(out1, out2, check.attributes=FALSE) | |
rm(out1, out2) | |
rm(df, dt) | |
rm(fstrsplit) | |
#----------------------------------------------------------------------------- | |
# Example1 | |
#----------------------------------------------------------------------------- | |
out1 <- unite_(mtcars, "vs_am", c("vs","am")) | |
out2 <- data.table(mtcars) | |
out2[, "vs_am" := paste0(vs, "_", am)][ ,`:=`(vs = NULL, am = NULL)] | |
# http://stackoverflow.com/questions/16638484/remove-multiple-columns-from-data-table | |
setcolorder( | |
out2, | |
c("mpg","cyl","disp","hp","drat","wt","qsec","vs_am","gear","carb") | |
) | |
all.equal(out1, out2, check.attributes=FALSE) | |
out1 <- mtcars %>% | |
unite(vs_am, vs, am) %>% | |
separate(vs_am, c("vs", "am")) | |
out2 <- out2[, c("vs", "am") := tstrsplit(vs_am, "_", fixed=TRUE)][,vs_am:=NULL] | |
setcolorder( | |
out2, | |
c("mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb") | |
) | |
all.equal(out1, out2, check.attributes=FALSE) | |
rm(out1, out2) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment