Skip to content

Instantly share code, notes, and snippets.

@traversc
Created January 10, 2020 08:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save traversc/a3b42c2ca0af940df40ed954c40cb315 to your computer and use it in GitHub Desktop.
Save traversc/a3b42c2ca0af940df40ed954c40cb315 to your computer and use it in GitHub Desktop.
library(qs)
library(data.table)
library(dplyr)
library(ggplot2)
library(fst)
library(patchwork)
library(ggformula)
library(hrbrthemes)
library(Rcpp)
library(trqwe)
# Scratch file that every benchmark below writes to and then reads back.
# Pick the line matching the machine the benchmark runs on.
outfile <- "~/N/test.z" # Ubuntu WSL
# outfile <- "N:/test.z" # Windows
# outfile <- "/tmp/test.z" # Mac
# Row count used by generateData() for each generated test data frame.
nr <- 5e6
# Value(s) placed in the `rep` column of every benchmark parameter grid.
reps <- 1
# Serialize `object` and write it to `file` through an external `pigz`
# process, so the byte stream is compressed on multiple threads.
#
# Arguments:
#   object    R object to serialize.
#   file      Output path. "~" is expanded by R and the result is shell-quoted,
#             so paths containing spaces or shell metacharacters are safe.
#   mc.cores  Thread count handed to pigz via `-p`.
#
# Returns, invisibly, whatever close() reports for the pipe connection.
mserialize <- function (object, file, mc.cores = min(parallel::detectCores(), 4))
{
  # path.expand() first: once the path is quoted the shell no longer expands "~".
  cmd <- paste0("pigz -p", mc.cores, " > ", shQuote(path.expand(file)))
  con <- pipe(cmd, "wb")
  # Make sure the pipe is closed even when serialize() fails part-way through.
  closed <- FALSE
  on.exit(if (!closed) close(con), add = TRUE)
  serialize(object, con, xdr = FALSE)
  closed <- TRUE
  close(con)
}
# Serialize `object` to `file` through a plain binary file connection
# (native byte order, no compression step).
#
# Arguments:
#   object    R object to serialize.
#   file      Output path.
#   mc.cores  Not used by this function; kept so the signature matches
#             mserialize().
#
# Returns, invisibly, whatever close() reports for the file connection.
lserialize <- function(object, file, mc.cores = min(parallel::detectCores(), 4))
{
  con <- file(file, "wb")
  # Make sure the connection is closed even when serialize() fails.
  closed <- FALSE
  on.exit(if (!closed) close(con), add = TRUE)
  serialize(object, con, xdr = FALSE)
  closed <- TRUE
  close(con)
}
# Build the synthetic data frame used as benchmark payload.
#
# Arguments:
#   n  Number of rows; defaults to the script-level `nr`.
#
# Columns:
#   a  standard-normal doubles
#   b  Poisson counts with mean 100
#   c  character strings sampled from starnames[["IAU Name"]]
#      (`starnames` is not defined in this file; presumably the data set that
#      ships with qs -- confirm)
#   d  factor of sampled US state names
#
# The original called rpois(100, nr), i.e. only 100 draws that data.frame()
# then recycled (and which fails when the row count is not a multiple of 100).
# The arguments are now in (n, lambda) order so `b` has one draw per row.
generateData <- function(n = nr) {
  data.frame(
    a = rnorm(n),
    b = rpois(n, 100),
    c = sample(starnames[["IAU Name"]], n, TRUE),
    d = factor(sample(state.name, n, TRUE)),
    stringsAsFactors = FALSE
  )
}
# Parameter grids for the qs benchmarks: one row per combination of
# algorithm, compression level, shuffle setting, thread count and repetition.
grid <- expand.grid(
  algo = "lz4",
  cl = c(1:9, seq(10, 150, 10)),
  shuf = c(0, 7),
  threads = c(1, 4),
  rep = reps,
  stringsAsFactors = FALSE
)
grid3 <- expand.grid(
  algo = "zstd",
  cl = c(seq(-50, -10, by = 5), -5:-1, 1:15),
  shuf = 7,
  threads = c(1, 4),
  rep = reps,
  stringsAsFactors = FALSE
)
grid4 <- expand.grid(
  algo = "zstd_stream",
  cl = c(seq(-50, -10, by = 5), -5:-1, 1:15),
  shuf = 7,
  threads = 1,
  rep = reps,
  stringsAsFactors = FALSE
)
# Stack the three grids, then put the rows in random order.
grid <- rbind(grid, grid3, grid4)
grid <- grid %>% sample_n(size = nrow(grid))
# qs benchmark: for every row of `grid`, generate fresh data, time qsave()
# with that row's settings, time qread() of the same file, and record the
# resulting file size. Produces one result row per grid row.
# seq_len() keeps the loop empty when `grid` has no rows (1:0 would not).
res <- lapply(seq_len(nrow(grid)), function(i) {
  print(grid[i, ])
  x1 <- generateData()
  time <- as.numeric(Sys.time())
  qsave(x1, file = outfile, preset = "custom", compress_level = grid$cl[i],
        shuffle_control = grid$shuf[i], algorithm = grid$algo[i],
        nthreads = grid$threads[i], check_hash = FALSE)
  time <- as.numeric(Sys.time()) - time
  # Drop the in-memory copy and collect garbage before timing the read.
  rm(x1)
  gc()
  read_time <- as.numeric(Sys.time())
  x1 <- qread(file = outfile)
  read_time <- as.numeric(Sys.time()) - read_time
  fs <- file.info(outfile)$size
  rm(x1)
  gc()
  data.frame(time, fs, read_time)
}) %>% rbindlist
# Attach the settings that produced each measurement.
res <- cbind(grid, res)
# fst benchmark: compression levels 0..85 in steps of 5, at 1 and 4 threads,
# visited in random order.
grid2 <- expand.grid(algo = "fst", cl = seq(0, 85, by = 5), threads = c(1, 4), rep = reps)
grid2 <- sample_n(grid2, nrow(grid2))
# For every grid row: generate fresh data, time write_fst(), time read_fst()
# of the same file, and record the resulting file size.
# seq_len() keeps the loop empty when `grid2` has no rows (1:0 would not).
res2 <- lapply(seq_len(nrow(grid2)), function(i) {
  print(grid2[i, ])
  x1 <- generateData()
  # Set the thread count before starting the clock so that only the write
  # itself is measured.
  threads_fst(grid2$threads[i])
  time <- as.numeric(Sys.time())
  write_fst(x1, path = outfile, compress = grid2$cl[i])
  time <- as.numeric(Sys.time()) - time
  # Drop the in-memory copy and collect garbage before timing the read.
  rm(x1)
  gc()
  read_time <- as.numeric(Sys.time())
  x1 <- read_fst(outfile)
  read_time <- as.numeric(Sys.time()) - read_time
  fs <- file.info(outfile)$size
  rm(x1)
  gc()
  data.frame(time, fs, read_time)
}) %>% rbindlist
# Attach the settings that produced each measurement.
res2 <- cbind(grid2, res2)
# Baseline benchmark using R's own serialization. Only the single-threaded
# case is in the grid; the multi-threaded branch (mserialize / mcreadRDS) is
# selected when a row has threads != 1.
gridb <- expand.grid(algo = "saveRDS", threads = c(1), cl = NA, rep = reps)
gridb <- sample_n(gridb, nrow(gridb))
# seq_len() keeps the loop empty when `gridb` has no rows (1:0 would not).
resb <- lapply(seq_len(nrow(gridb)), function(i) {
  # Plain if/else: the condition is a scalar and the branches are functions,
  # which ifelse() is not designed to return.
  single_threaded <- gridb$threads[i] == 1
  savefun <- if (single_threaded) lserialize else mserialize
  readfun <- if (single_threaded) readRDS else mcreadRDS
  print(gridb[i, ])
  x1 <- generateData()
  time <- as.numeric(Sys.time())
  savefun(x1, outfile)
  time <- as.numeric(Sys.time()) - time
  # Drop the in-memory copy and collect garbage before timing the read.
  rm(x1)
  gc()
  read_time <- as.numeric(Sys.time())
  x1 <- readfun(outfile)
  read_time <- as.numeric(Sys.time()) - read_time
  fs <- file.info(outfile)$size
  rm(x1)
  gc()
  data.frame(time, fs, read_time)
}) %>% rbindlist
# Attach the settings that produced each measurement.
resb <- cbind(gridb, resb)
# Give every result set a human-readable `method` label.
res <- mutate(res, method = sprintf("qs:%s shuffle=%s", algo, shuf))
res2 <- mutate(res2, method = "fst")
resb <- mutate(resb, method = "saveRDS / readRDS")
# Stack the three result sets on their shared columns.
rf <- rbind(
  dplyr::select(res, method, threads, time, read_time, fs, cl),
  dplyr::select(res2, method, threads, time, read_time, fs, cl),
  dplyr::select(resb, method, threads, time, read_time, fs, cl)
)
# File size: bytes -> units of 1e6 bytes.
rf <- mutate(rf, fs = fs / 1e6)
# Persist the raw measurements so the plotting code can be re-run on its own.
saveRDS(rf, file = "bench_data_v20.1.rds")
#########################
# Reload the saved measurements and average them per (method, threads, cl).
# Two configurations are dropped first: qs at cl == 120 and fst at cl == 15
# (the reason is not recorded in this file).
rf <- readRDS("bench_data_v20.1.rds")
rf <- rf %>%
  filter(!(method %like% "qs" & cl == 120)) %>%
  filter(!(method %like% "fst" & cl == 15)) %>%
  group_by(method, threads, cl) %>%
  summarize(time = mean(time), read_time = mean(read_time), fs = mean(fs)) %>%
  ungroup()
# rf$threads <- ifelse(threads == 1, "1 thread", "4 threads")
# Single-threaded points that get a text label in the plots.
# The label is derived from `method` instead of being attached by position, so
# it stays correct if a row is missing or the row order changes.
# NOTE(review): the "fast" point here uses cl == 150, while the summary table
# further down selects cl == 100 for the same method -- confirm which is meant.
rfavglab <- rf %>%
  filter(threads == 1) %>%
  group_by(method, cl) %>%
  summarize(time = mean(time), read_time = mean(read_time), fs = mean(fs)) %>%
  filter((method == "qs:lz4 shuffle=0" & cl == 150) |
           (method == "qs:lz4 shuffle=7" & cl == 1) |
           (method == "qs:zstd shuffle=7" & cl == 4)) %>%
  arrange(method) %>%
  ungroup() %>%
  mutate(label = case_when(
    method == "qs:lz4 shuffle=0" ~ "fast",
    method == "qs:lz4 shuffle=7" ~ "balanced",
    method == "qs:zstd shuffle=7" ~ "high"
  ))
# rfavg <- rf %>%
# group_by(method, cl) %>% summarize(time = mean(time), fs = mean(fs))
# Write-time plot: file size (x) against write time (y, log scale), one point
# per averaged configuration, coloured by method and shaped by thread count.
# The rows of `rfavglab` are overdrawn in black and annotated with their label.
g <- ggplot() +
  geom_point(data = rf, mapping = aes(x = fs, color = method, y = time, shape = factor(threads)), alpha = 0.5, show.legend = TRUE) +
  # geom_spline(data = rf, mapping=aes(x=fs, color=method, y=time, lty=factor(threads)), nknots=5, spar = 0.25, lwd=.81) +
  # geom_line(data = rf, mapping=aes(x=fs, color=method, y=time, lty=factor(threads)), lwd=.81) +
  geom_point(data = rfavglab, mapping = aes(x = fs, y = time), color = "black", alpha = 1, show.legend = FALSE) +
  # Text is placed slightly above its point.
  geom_text(data = rfavglab, mapping = aes(x = fs, y = time + .05, label = label), color = "black", size = 3.5, show.legend = FALSE) +
  theme_ipsum_rc() +
  theme(plot.margin = unit(rep(.5, 4), "lines"),
        # legend.position="bottom",
        legend.key.size = unit(1, 'lines'),
        legend.box.just = "left",
        legend.text = element_text(margin = margin(r = 1, unit = "lines")),
        axis.title.x = element_text(size = rel(1.3)),
        axis.title.y = element_text(size = rel(1.3))) +
  labs(x = "File size (MB)", y = "Time (s)", color = "Algorithm", shape = "Threads") +
  scale_y_log10(breaks = c(.1, .2, .5, 1, 2, 5, 10, 20, 50)) # +
# scale_x_sqrt(limits=c(NA,NA), breaks=seq(0,500,by=12)) +
# guides(color=guide_legend(nrow=2,byrow=T))
plot(g)
# Name both arguments explicitly; the original `file =` only worked through
# partial matching against ggsave()'s `filename` parameter.
ggsave(filename = "df_bench_complete_write_v20.1.png", plot = g, width = 6.5, height = 4.5, dpi = 300)
# Read-time plot: file size (x) against read time (y, log scale). Points are
# coloured by method; a line per method and thread count connects them, with
# line type encoding the thread count. The rows of `rfavglab` are overdrawn in
# black and annotated with their label.
g <- ggplot() +
  geom_point(data = rf, mapping = aes(x = fs, color = method, y = read_time, shape = factor(threads)), alpha = 0.5, show.legend = FALSE) +
  # geom_spline(data = rf, mapping=aes(x=fs, color=method, y=read_time, lty=factor(threads)), nknots=5, spar = 0.25, lwd=.81) +
  geom_line(data = rf, mapping = aes(x = fs, color = method, y = read_time, lty = factor(threads)), lwd = .81) +
  geom_point(data = rfavglab, mapping = aes(x = fs, y = read_time), color = "black", alpha = 1) +
  # Text is placed slightly above its point.
  geom_text(data = rfavglab, mapping = aes(x = fs, y = read_time + .05, label = label), color = "black", size = 3.5) +
  theme_ipsum_rc() +
  theme(plot.margin = unit(rep(.5, 4), "lines"), legend.position = "right",
        legend.key.size = unit(1, 'lines'),
        legend.box.just = "left",
        legend.text = element_text(margin = margin(r = 1, unit = "lines")),
        axis.title.x = element_text(size = rel(1.3)),
        axis.title.y = element_text(size = rel(1.3))) +
  labs(x = "File size (MB)", y = "Time (s)", color = "Algorithm", lty = "Threads") +
  scale_y_log10(breaks = c(.1, .2, .5, 1, 2, 5, 10, 20, 50)) # +
# scale_x_sqrt(limits=c(NA,NA), breaks=seq(0,500,by=12)) +
# guides(color=guide_legend(nrow=2,byrow=T))
plot(g)
# Name both arguments explicitly; the original `file =` only worked through
# partial matching against ggsave()'s `filename` parameter.
ggsave(filename = "df_bench_complete_read_v20.1.png", plot = g, width = 6.5, height = 4.5, dpi = 300)
# Summary table: mean write time, read time and file size for a hand-picked
# set of configurations, one row per (method, compression level, threads).
sumtab <- rf %>%
  group_by(method, cl, threads) %>%
  summarize(`Write Time (s)` = mean(time), `Read Time (s)` = mean(read_time), `File Size (Mb)` = mean(fs)) %>%
  filter((method == "qs:lz4 shuffle=0" & cl == 100) |
           (method == "qs:lz4 shuffle=7" & cl == 1) |
           (method == "qs:zstd shuffle=7" & cl == 4) |
           (method == "qs:zstd_stream shuffle=7" & cl == 14) |
           (method == "fst" & cl == 0) |
           (method == "fst" & cl == 50) |
           (method == "fst" & cl == 85) |
           (method == "saveRDS / readRDS")) %>%
  dplyr::rename(Algorithm = method, Threads = threads) %>%
  ungroup()
# Fold the compression level into the algorithm name (the saveRDS baseline
# has no level), move Algorithm to the front and drop the now-redundant `cl`.
sumtab <- sumtab %>%
  mutate(Algorithm = ifelse(Algorithm != "saveRDS / readRDS", sprintf("%s C=%s", Algorithm, cl), Algorithm)) %>%
  dplyr::select(Algorithm, everything()) %>%
  dplyr::select(-cl)
# Tag the first row of each named configuration. Rows are located by
# algorithm name rather than by hard-coded row numbers, which silently go
# stale whenever the number of benchmarked configurations changes.
preset_suffix <- c(
  "qs:lz4 shuffle=0 C=100" = "(fast)",
  "qs:lz4 shuffle=7 C=1" = "(balanced)",
  "qs:zstd shuffle=7 C=4" = "(high)",
  "qs:zstd_stream shuffle=7 C=14" = "(archive)"
)
suffix <- unname(preset_suffix[sumtab$Algorithm])
tag_row <- !is.na(suffix) & !duplicated(sumtab$Algorithm)
sumtab$Algorithm[tag_row] <- paste(sumtab$Algorithm[tag_row], suffix[tag_row])
# Put the saveRDS baseline row(s) first, however many there are; the original
# sumtab[14:15, ] indexed past the end when only one baseline row exists.
is_baseline <- sumtab$Algorithm == "saveRDS / readRDS"
sumtab <- rbind(sumtab[is_baseline, ], sumtab[!is_baseline, ])
# write.csv(sumtab, file="df_bench_summary.csv", row.names=F)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment