Skip to content

Instantly share code, notes, and snippets.

@rnfermincota
Last active September 26, 2021 03:54
Show Gist options
  • Save rnfermincota/74169e2e73f199165e52b8393e1fbb01 to your computer and use it in GitHub Desktop.
Save rnfermincota/74169e2e73f199165e52b8393e1fbb01 to your computer and use it in GitHub Desktop.
Separate and unite are complements
rm(list=ls())
#-----------------------------------------------------------------------------
library(dplyr)
library(tidyr)
library(data.table)
#-----------------------------------------------------------------------------
#----------------------------------------------------------------------------
# Example5
#----------------------------------------------------------------------------
dt = data.table(
x=sample(5,20,TRUE),
y=sample(2,20,TRUE),
z=sample(letters[1:2], 20,TRUE),
d1 = runif(20),
d2=1L
)
df <- tbl_df(dt)
# multiple fun.agg and value.var (all combinations)
out1 <- dcast.data.table(
dt,
x + y ~ z,
fun=list(sum, mean),
value.var=c("d1", "d2"),
variable.factor = FALSE
)
out2 <- df %>%
group_by(x, y, z) %>% summarise(
dsum1=sum(d1),
dmean1=mean(d1),
dsum2=sum(d2),
dmean2=mean(d2)
)
out2 <- out2 %>%
unite(a1_b1, dsum1, dmean1, sep=",") %>%
unite(a2_b2, dsum2, dmean2, sep=",") %>%
unite(a1_b1_a2_b2, a1_b1, a2_b2, sep="|") %>%
spread(z, a1_b1_a2_b2)
iff <- function(x, v=0){
if (is.na(x) | x == "NA"){
if (v==0){
return("NA,NA|NA,NA")
}else{
return(0)
}
}else{
if (x==0){
return("0,0|0,0")
}else{
return(x)
}
}
}
out2 <- out2 %>%
mutate(a=sapply(a, function(x) iff(x, 0)), b=sapply(b, function(x) iff(x,0)))
out2 <- out2 %>%
separate(a, c("d1_a", "d2_a"), sep="[|]") %>%
separate(b, c("d1_b", "d2_b"), sep="[|]")
out2 <- out2 %>%
separate(d1_a, c("d1_sum_a", "d1_mean_a"), convert=TRUE, sep=",") %>%
separate(d2_a, c("d2_sum_a", "d2_mean_a"), convert=TRUE, sep=",") %>%
separate(d1_b, c("d1_sum_b", "d1_mean_b"), convert=TRUE, sep=",") %>%
separate(d2_b, c("d2_sum_b", "d2_mean_b"), convert=TRUE, sep=",") %>%
select(x, y, d1_sum_a, d1_sum_b, d2_sum_a, d2_sum_b, d1_mean_a, d1_mean_b, d2_mean_a, d2_mean_b)
out2 <- out2 %>%
mutate(
d1_sum_a=sapply(d1_sum_a, function(x) iff(x,1)),
d1_sum_b=sapply(d1_sum_b, function(x) iff(x,1)),
d2_sum_a=sapply(d2_sum_a, function(x) as.integer(iff(x,1))),
d2_sum_b=sapply(d2_sum_b, function(x) as.integer(iff(x,1)))
)
all.equal(as.data.frame(out1), out2, check.attributes=FALSE)
rm(df, dt)
rm(iff, out1, out2)
#----------------------------------------------------------------------------
# Example4
#----------------------------------------------------------------------------
dt = data.table(
x=sample(5,20,TRUE),
y=sample(2,20,TRUE),
z=sample(letters[1:2], 20,TRUE),
d1 = runif(20),
d2=1L
)
df <- tbl_df(dt)
out1 <- dcast.data.table(
dt,
x + y ~ z,
fun=sum,
value.var=c("d1","d2"),
variable.factor = FALSE
)[order(x,y)]
out2 <- df %>%
group_by(x, y, z) %>%
summarise(
d1_sum=sum(d1),
d2_sum=sum(d2)
)
out2 <- out2 %>%
unite(a_b, d1_sum, d2_sum, sep=",") %>%
spread(z, a_b, fill = 0) %>%
mutate(
a=ifelse(a==0, "0,0", a),
b=ifelse(b==0, "0,0", b)
)
out2 <- out2 %>%
separate(a, c("d1_sum_a", "d2_sum_a"), sep=",", convert=TRUE) %>%
separate(b, c("d1_sum_b", "d2_sum_b"), sep=",", convert=TRUE) %>%
select(x, y, d1_sum_a, d1_sum_b, d2_sum_a, d2_sum_b)
all.equal(as.data.frame(out1), out2, check.attributes=FALSE)
rm(df, dt)
rm(out1, out2)
#----------------------------------------------------------------------------
# Example3
#----------------------------------------------------------------------------
set.seed(45)
DT <- data.table(
i_1 = c(1:5, NA),
i_2 = c(NA,6,7,8,9,10),
f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE),
c_1 = sample(c(letters[1:3], NA), 6, TRUE),
d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"),
d_2 = as.Date(6:1, origin="2012-01-01"))
DF <- tibble(DT)
# add a couple of list cols
set.seed(123)
DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c]
set.seed(123)
DF <- DF %>%
group_by(i_1) %>%
mutate(l_1 = list(rep(i_1, sample(5,1)))) %>%
ungroup
all.equal(data.frame(DT), DF, check.attributes = FALSE)
set.seed(123)
DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c]
set.seed(123)
DF <- DF %>%
group_by(i_1) %>%
mutate(l_2 = list(rep(c_1, sample(5,1)))) %>%
ungroup
all.equal(data.frame(DT), DF, check.attributes = FALSE)
out1 <- melt.data.table(
DT,
id=1:2,
measure=patterns("^f_", "^d_"),
value.factor=TRUE,
variable.factor = FALSE)
out2 <- DF %>% select(-matches("^c_|^l_")) %>%
unite("1", f_1, d_1, sep=",") %>%
unite("2", f_2, d_2, sep=",")
out2 <- out2 %>%
gather(variable, value, 3:4) %>%
separate(value, c("value1", "value2"), sep=",", convert=TRUE)
out2 <- out2 %>% mutate(
value1=factor(value1, levels=sort(unique(out2$value1)), ordered=TRUE),
value2=as.Date(value2)
)
all.equal(out1, as.data.table(out2))
rm(DF, DT)
rm(out1, out2)
#----------------------------------------------------------------------------
# Example2
#----------------------------------------------------------------------------
df <- data.frame(x = c("a", "a b", "a b c", NA), stringsAsFactors = FALSE)
dt <- data.table(df)
fstrsplit <- function(x, d){
v <- strsplit(x, d, fixed = FALSE)
for (i in 1:length(v)){
if (length(v[[i]]) == 1){v[[i]][2]<- v[[i]][1]; v[[i]][1]<- NA}
if (length(v[[i]]) > 2){v[[i]][2] <- paste(v[[i]][2:length(v[[i]])], collapse=" ")}
}
n <- length(v[[1]]) # assuming all lists in before have the same length
# http://stackoverflow.com/questions/16179197/transpose-a-list-of-lists
lapply(1:n, function(i) sapply(v, "[[", i))
}
out1 <- df %>% separate(x, c("a", "b"), extra = "merge", fill = "left")
out2 <- dt[, c("a", "b") := fstrsplit(x, " ")][,x:=NULL] # TO REVISE: WRONG
all.equal(out1, out2, check.attributes=FALSE)
rm(out1, out2)
rm(df, dt)
rm(fstrsplit)
#-----------------------------------------------------------------------------
# Example1
#-----------------------------------------------------------------------------
out1 <- unite_(mtcars, "vs_am", c("vs","am"))
out2 <- data.table(mtcars)
out2[, "vs_am" := paste0(vs, "_", am)][ ,`:=`(vs = NULL, am = NULL)]
# http://stackoverflow.com/questions/16638484/remove-multiple-columns-from-data-table
setcolorder(
out2,
c("mpg","cyl","disp","hp","drat","wt","qsec","vs_am","gear","carb")
)
all.equal(out1, out2, check.attributes=FALSE)
out1 <- mtcars %>%
unite(vs_am, vs, am) %>%
separate(vs_am, c("vs", "am"))
out2 <- out2[, c("vs", "am") := tstrsplit(vs_am, "_", fixed=TRUE)][,vs_am:=NULL]
setcolorder(
out2,
c("mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb")
)
all.equal(out1, out2, check.attributes=FALSE)
rm(out1, out2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment