# Gist traversc/a3b42c2ca0af940df40ed954c40cb315, created January 10, 2020 08:42.
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears. Open it in an
# editor that reveals hidden Unicode characters before relying on a copy.
# ---- Setup ----
# Packages used by the benchmark and the plots. The attach order is kept as in
# the original script because later packages mask names from earlier ones.
library(qs)
library(data.table)
library(dplyr)
library(ggplot2)
library(fst)
library(patchwork)
library(ggformula)
library(hrbrthemes)
library(Rcpp)
library(trqwe)

# Scratch file that every benchmark run writes to and then reads back.
outfile <- "~/N/test.z" # Ubuntu WSL
# outfile <- "N:/test.z" # Windows
# outfile <- "/tmp/test.z" # Mac

# Number of rows in the generated test data frame.
nr <- 5e6
# Repetition setting used for the `rep` column of the benchmark grids.
reps <- 1
# Serialize `object` and compress it with an external multi-threaded `pigz`
# process that writes to `file`.
#
# Args:
#   object:   R object to serialize.
#   file:     Path of the compressed output file.
#   mc.cores: Thread count passed to `pigz -p`.
# Returns:
#   The value of close() on the pipe, invisibly.
mserialize <- function(object, file, mc.cores = min(parallel::detectCores(), 4)) {
  # Expand `~` in R and quote the result: a quoted path is no longer
  # tilde-expanded by the shell, but an unquoted one breaks on spaces and
  # other shell metacharacters.
  target <- shQuote(path.expand(file))
  con <- pipe(paste0("pigz -p", mc.cores, " > ", target), "wb")

  # Make sure the pipe is closed even when serialize() fails.
  closed <- FALSE
  on.exit(if (!closed) close(con), add = TRUE)

  serialize(object, con, xdr = FALSE)

  closed <- TRUE
  close(con)
}
# Serialize `object` to `file` through a plain binary file connection, with no
# compression step.
#
# Args:
#   object:   R object to serialize.
#   file:     Path of the output file.
#   mc.cores: Not used by this function; it is accepted so the signature
#             matches mserialize().
# Returns:
#   NULL, invisibly.
lserialize <- function(object, file, mc.cores = min(parallel::detectCores(), 4)) {
  con <- file(file, "wb")
  # Release the connection even when serialize() fails.
  on.exit(close(con), add = TRUE)
  serialize(object, con, xdr = FALSE)
  invisible(NULL)
}
# Build the synthetic data frame that every benchmark run writes and reads.
#
# Args:
#   n: Number of rows. Defaults to the script-level `nr`.
# Returns:
#   A data.frame with `n` rows and four columns:
#     a - standard-normal doubles,
#     b - Poisson counts with mean 100,
#     c - strings sampled with replacement from starnames[["IAU Name"]]
#         (`starnames` is not defined in this file; presumably the dataset
#         shipped with qs - confirm),
#     d - factor of values sampled with replacement from `state.name`.
generateData <- function(n = nr) {
  data.frame(
    a = rnorm(n),
    # rpois() takes (n, lambda). Calling it as rpois(100, n) yields only 100
    # draws with mean `n`, which data.frame() then recycles down the column
    # (and rejects outright when `n` is not a multiple of 100).
    b = rpois(n, 100),
    c = sample(starnames[["IAU Name"]], n, TRUE),
    d = factor(sample(state.name, n, TRUE)),
    stringsAsFactors = FALSE
  )
}
# ---- qs benchmark ----
# Parameter grids for qsave(): one row per combination of algorithm,
# compression level, shuffle_control and thread count.
# `rep = seq_len(reps)` gives `reps` rows per combination. Passing `reps`
# itself only stored that number in a single row, so raising `reps` never
# repeated a measurement.
zstd_levels <- c(seq(-50, -10, by = 5), -5, -4, -3, -2, -1, 1:15)
grid <- expand.grid(
  algo = "lz4", cl = c(1:9, seq(10, 150, 10)), shuf = c(0, 7),
  threads = c(1, 4), rep = seq_len(reps), stringsAsFactors = FALSE
)
grid3 <- expand.grid(
  algo = "zstd", cl = zstd_levels, shuf = 7,
  threads = c(1, 4), rep = seq_len(reps), stringsAsFactors = FALSE
)
grid4 <- expand.grid(
  algo = "zstd_stream", cl = zstd_levels, shuf = 7,
  threads = 1, rep = seq_len(reps), stringsAsFactors = FALSE
)
grid <- rbind(grid, grid3, grid4)
# Randomize the run order (presumably to avoid order effects in the timings).
grid <- sample_n(grid, nrow(grid))

# For each grid row: generate fresh data, time the write, free the data, time
# the read, and record the size of the file on disk.
res <- lapply(seq_len(nrow(grid)), function(i) {
  print(grid[i, ])
  x1 <- generateData()

  time <- as.numeric(Sys.time())
  qsave(
    x1,
    file = outfile, preset = "custom", compress_level = grid$cl[i],
    shuffle_control = grid$shuf[i], algorithm = grid$algo[i],
    nthreads = grid$threads[i], check_hash = FALSE
  )
  time <- as.numeric(Sys.time()) - time

  # Drop the in-memory copy and collect garbage before the read is timed.
  rm(x1)
  gc()

  read_time <- as.numeric(Sys.time())
  x1 <- qread(file = outfile)
  read_time <- as.numeric(Sys.time()) - read_time

  fs <- file.info(outfile)$size
  rm(x1)
  gc()

  data.frame(time, fs, read_time)
}) %>% rbindlist()

# Results come back in grid order, so they can be bound column-wise.
res <- cbind(grid, res)
# ---- fst benchmark ----
# Parameter grid for write_fst(): compression 0..85 in steps of 5, at 1 and 4
# threads. `rep = seq_len(reps)` gives `reps` rows per combination; passing
# `reps` itself only stored that number in a single row.
grid2 <- expand.grid(
  algo = "fst", cl = seq(0, 85, by = 5), threads = c(1, 4),
  rep = seq_len(reps)
)
# Randomize the run order.
grid2 <- sample_n(grid2, nrow(grid2))

# For each grid row: generate fresh data, time the write, free the data, time
# the read, and record the size of the file on disk.
res2 <- lapply(seq_len(nrow(grid2)), function(i) {
  print(grid2[i, ])
  x1 <- generateData()

  time <- as.numeric(Sys.time())
  # The thread count is set inside the timed region, as in the original
  # measurement; the setting is still in effect for the read below.
  threads_fst(grid2$threads[i])
  write_fst(x1, path = outfile, compress = grid2$cl[i])
  time <- as.numeric(Sys.time()) - time

  # Drop the in-memory copy and collect garbage before the read is timed.
  rm(x1)
  gc()

  read_time <- as.numeric(Sys.time())
  x1 <- read_fst(outfile)
  read_time <- as.numeric(Sys.time()) - read_time

  fs <- file.info(outfile)$size
  rm(x1)
  gc()

  data.frame(time, fs, read_time)
}) %>% rbindlist()

# Results come back in grid order, so they can be bound column-wise.
res2 <- cbind(grid2, res2)
# ---- Base serialization benchmark ----
# Single-threaded baseline grid. `rep = seq_len(reps)` gives `reps` rows per
# combination; passing `reps` itself only stored that number in a single row.
gridb <- expand.grid(
  algo = "saveRDS", threads = 1, cl = NA, rep = seq_len(reps)
)
gridb <- sample_n(gridb, nrow(gridb))

# For each grid row: pick the writer/reader pair for the thread count, then
# time the write and the read and record the size of the file on disk.
resb <- lapply(seq_len(nrow(gridb)), function(i) {
  # Plain if/else picks between functions. ifelse() is meant for vectors and
  # only returns a function through a scalar special case.
  single_threaded <- gridb$threads[i] == 1
  savefun <- if (single_threaded) lserialize else mserialize
  readfun <- if (single_threaded) readRDS else mcreadRDS

  print(gridb[i, ])
  x1 <- generateData()

  time <- as.numeric(Sys.time())
  savefun(x1, outfile)
  time <- as.numeric(Sys.time()) - time

  # Drop the in-memory copy and collect garbage before the read is timed.
  rm(x1)
  gc()

  read_time <- as.numeric(Sys.time())
  x1 <- readfun(outfile)
  read_time <- as.numeric(Sys.time()) - read_time

  fs <- file.info(outfile)$size
  rm(x1)
  gc()

  data.frame(time, fs, read_time)
}) %>% rbindlist()

# Results come back in grid order, so they can be bound column-wise.
resb <- cbind(gridb, resb)
# ---- Combine results ----
# Give every result set a `method` label used for colouring and filtering.
res <- res %>% mutate(method = sprintf("qs:%s shuffle=%s", algo, shuf))
res2 <- res2 %>% mutate(method = "fst")
resb <- resb %>% mutate(method = "saveRDS / readRDS")

# The three result sets have different parameter columns, so keep only the
# columns they share before stacking them.
keep_shared <- function(d) {
  dplyr::select(d, method, threads, time, read_time, fs, cl)
}
rf <- rbind(keep_shared(res), keep_shared(res2), keep_shared(resb)) %>%
  mutate(fs = fs / 1e6) # bytes -> MB

saveRDS(rf, file = "bench_data_v20.1.rds")
#########################
# ---- Reload and summarize ----
rf <- readRDS("bench_data_v20.1.rds")

# Drop two specific settings, then average the measurements for each
# (method, threads, cl) combination. ungroup() stops later steps from
# inheriting the grouping that summarize() leaves behind.
rf <- rf %>%
  filter(!(method %like% "qs" & cl == 120)) %>%
  filter(!(method %like% "fst" & cl == 15)) %>%
  group_by(method, threads, cl) %>%
  summarize(time = mean(time), read_time = mean(read_time), fs = mean(fs)) %>%
  ungroup()
# rf$threads <- ifelse(threads == 1, "1 thread", "4 threads")

# Single-thread points that get a text label in the plots.
# NOTE(review): "qs:lz4 shuffle=0" is selected at cl == 150 here, while the
# summary table at the end of the script selects it at cl == 100 - confirm
# which level is intended.
rfavglab <- rf %>%
  filter(threads == 1) %>%
  group_by(method, cl) %>%
  summarize(time = mean(time), read_time = mean(read_time), fs = mean(fs)) %>%
  ungroup() %>%
  filter(
    (method == "qs:lz4 shuffle=0" & cl == 150) |
      (method == "qs:lz4 shuffle=7" & cl == 1) |
      (method == "qs:zstd shuffle=7" & cl == 4)
  ) %>%
  arrange(method) %>%
  # Label by method rather than by row position, so a missing or extra row
  # cannot shift the labels onto the wrong points.
  mutate(label = case_when(
    method == "qs:lz4 shuffle=0" ~ "fast",
    method == "qs:lz4 shuffle=7" ~ "balanced",
    method == "qs:zstd shuffle=7" ~ "high"
  ))
# rfavg <- rf %>%
# group_by(method, cl) %>% summarize(time = mean(time), fs = mean(fs))
# ---- Write-time plot ----
# File size (x) against write time (y, log scale). Colour encodes the method,
# point shape the thread count; the points in `rfavglab` are overdrawn in black
# with their text label.
g <- ggplot() +
  geom_point(
    data = rf,
    mapping = aes(x = fs, color = method, y = time, shape = factor(threads)),
    alpha = 0.5, show.legend = TRUE
  ) +
  # geom_spline(data = rf, mapping=aes(x=fs, color=method, y=time, lty=factor(threads)), nknots=5, spar = 0.25, lwd=.81) +
  # geom_line(data = rf, mapping=aes(x=fs, color=method, y=time, lty=factor(threads)), lwd=.81) +
  geom_point(
    data = rfavglab, mapping = aes(x = fs, y = time),
    color = "black", alpha = 1, show.legend = FALSE
  ) +
  # The label is offset upwards by 0.05 so it does not sit on the point.
  geom_text(
    data = rfavglab, mapping = aes(x = fs, y = time + .05, label = label),
    color = "black", size = 3.5, show.legend = FALSE
  ) +
  theme_ipsum_rc() +
  theme(
    plot.margin = unit(rep(.5, 4), "lines"),
    # legend.position="bottom",
    legend.key.size = unit(1, "lines"),
    legend.box.just = "left",
    legend.text = element_text(margin = margin(r = 1, unit = "lines")),
    axis.title.x = element_text(size = rel(1.3)),
    axis.title.y = element_text(size = rel(1.3))
  ) +
  labs(
    x = "File size (MB)", y = "Time (s)",
    color = "Algorithm", shape = "Threads"
  ) +
  scale_y_log10(breaks = c(.1, .2, .5, 1, 2, 5, 10, 20, 50)) # +
# scale_x_sqrt(limits=c(NA,NA), breaks=seq(0,500,by=12)) +
# guides(color=guide_legend(nrow=2,byrow=T))
plot(g)
# Arguments are named in full; `ggsave(g, file = ...)` only worked because
# `file` partially matched `filename`.
ggsave(
  filename = "df_bench_complete_write_v20.1.png", plot = g,
  width = 6.5, height = 4.5, dpi = 300
)
# ---- Read-time plot ----
# File size (x) against read time (y, log scale). Colour encodes the method,
# line type the thread count; the points in `rfavglab` are overdrawn in black
# with their text label.
g <- ggplot() +
  geom_point(
    data = rf,
    mapping = aes(
      x = fs, color = method, y = read_time, shape = factor(threads)
    ),
    alpha = 0.5, show.legend = FALSE
  ) +
  # geom_spline(data = rf, mapping=aes(x=fs, color=method, y=read_time, lty=factor(threads)), nknots=5, spar = 0.25, lwd=.81) +
  geom_line(
    data = rf,
    mapping = aes(x = fs, color = method, y = read_time, lty = factor(threads)),
    lwd = .81
  ) +
  geom_point(
    data = rfavglab, mapping = aes(x = fs, y = read_time),
    color = "black", alpha = 1
  ) +
  # The label is offset upwards by 0.05 so it does not sit on the point.
  geom_text(
    data = rfavglab, mapping = aes(x = fs, y = read_time + .05, label = label),
    color = "black", size = 3.5
  ) +
  theme_ipsum_rc() +
  theme(
    plot.margin = unit(rep(.5, 4), "lines"),
    legend.position = "right",
    legend.key.size = unit(1, "lines"),
    legend.box.just = "left",
    legend.text = element_text(margin = margin(r = 1, unit = "lines")),
    axis.title.x = element_text(size = rel(1.3)),
    axis.title.y = element_text(size = rel(1.3))
  ) +
  labs(
    x = "File size (MB)", y = "Time (s)",
    color = "Algorithm", lty = "Threads"
  ) +
  scale_y_log10(breaks = c(.1, .2, .5, 1, 2, 5, 10, 20, 50)) # +
# scale_x_sqrt(limits=c(NA,NA), breaks=seq(0,500,by=12)) +
# guides(color=guide_legend(nrow=2,byrow=T))
plot(g)
# Arguments are named in full; `ggsave(g, file = ...)` only worked because
# `file` partially matched `filename`.
ggsave(
  filename = "df_bench_complete_read_v20.1.png", plot = g,
  width = 6.5, height = 4.5, dpi = 300
)
# ---- Summary table ----
# One row per selected (method, compression level, thread count) setting, with
# the serialization baseline listed first.
baseline_method <- "saveRDS / readRDS"
# Tag appended to the single-thread row of each of these qs settings.
preset_tags <- c(
  "qs:lz4 shuffle=0" = "(fast)",
  "qs:lz4 shuffle=7" = "(balanced)",
  "qs:zstd shuffle=7" = "(high)",
  "qs:zstd_stream shuffle=7" = "(archive)"
)

sumtab <- rf %>%
  group_by(method, cl, threads) %>%
  summarize(
    `Write Time (s)` = mean(time),
    `Read Time (s)` = mean(read_time),
    `File Size (Mb)` = mean(fs)
  ) %>%
  ungroup() %>%
  filter(
    (method == "qs:lz4 shuffle=0" & cl == 100) |
      (method == "qs:lz4 shuffle=7" & cl == 1) |
      (method == "qs:zstd shuffle=7" & cl == 4) |
      (method == "qs:zstd_stream shuffle=7" & cl == 14) |
      (method == "fst" & cl == 0) |
      (method == "fst" & cl == 50) |
      (method == "fst" & cl == 85) |
      (method == baseline_method)
  )

# Rows are tagged and ordered by their keys rather than by position. The
# hard-coded indices (7, 9, 11, 13 and 14:15) assumed 15 rows, but the grids
# in this script yield only 14 selected rows, so row 15 was out of range.
is_baseline <- sumtab$method == baseline_method
algorithm <- ifelse(
  is_baseline,
  sumtab$method,
  sprintf("%s C=%s", sumtab$method, sumtab$cl)
)
tag <- unname(preset_tags[as.character(sumtab$method)])
# Only the single-thread row of each preset was tagged by the index version.
tag_row <- !is.na(tag) & sumtab$threads == 1
algorithm[tag_row] <- paste(algorithm[tag_row], tag[tag_row])

sumtab <- sumtab %>%
  mutate(Algorithm = algorithm) %>%
  dplyr::select(
    Algorithm,
    Threads = threads,
    `Write Time (s)`, `Read Time (s)`, `File Size (Mb)`
  )
sumtab <- rbind(sumtab[is_baseline, ], sumtab[!is_baseline, ])
# write.csv(sumtab, file="df_bench_summary.csv", row.names=F)
# (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)