# traversc/qs_benchmark.r

## qs_benchmark.r
library(qs)
library(data.table)
library(dplyr)
library(ggplot2)
library(fst)
library(patchwork)
library(ggformula)
library(hrbrthemes)
library(Rcpp)
library(trqwe)

# Scratch file that every benchmark run writes to and then reads back.
# Uncomment the variant that matches the machine running the script.
outfile <- "~/N/test.z" # Ubuntu WSL
# outfile <- "N:/test.z" # Windows
# outfile <- "/tmp/test.z" # Mac
# Size argument handed to the random generators in generateData().
nr <- 5e6
# Value(s) placed in the `rep` column of every parameter grid built below.
reps <- 1

# Serialize `object` and pipe the bytes through an external `pigz` process
# whose output is redirected to `file`.
#
# object:   any R object accepted by serialize().
# file:     destination path; `~` is expanded and the result is shell-quoted so
#           paths containing spaces or shell metacharacters are safe.
# mc.cores: value appended to pigz's `-p` flag.
# Returns whatever close() returns for the pipe.
mserialize <- function(object, file, mc.cores = min(parallel::detectCores(), 4)) {
  cmd <- paste0("pigz -p", mc.cores, " > ", shQuote(path.expand(file)))
  con <- pipe(cmd, "wb")
  # Guarantee the pipe is closed even when serialize() fails part-way.
  closed <- FALSE
  on.exit(if (!closed) close(con), add = TRUE)
  serialize(object, con, xdr = FALSE)
  closed <- TRUE
  close(con)
}
# Serialize `object` straight to `file` with no compression.
#
# object:   any R object accepted by serialize().
# file:     destination path, opened in binary write mode.
# mc.cores: unused; kept so the signature matches mserialize().
# Returns whatever close() returns for the file connection.
lserialize <- function(object, file, mc.cores = min(parallel::detectCores(), 4)) {
  con <- base::file(file, "wb")
  # Guarantee the connection is closed even when serialize() fails part-way.
  closed <- FALSE
  on.exit(if (!closed) close(con), add = TRUE)
  serialize(object, con, xdr = FALSE)
  closed <- TRUE
  close(con)
}

# Build the synthetic data frame used as the benchmark payload.
#
# Reads the global `nr` for the number of rows and `starnames` for the pool of
# strings sampled into column `c`. Columns:
#   a: `nr` standard-normal draws
#   b: `nr` Poisson draws with mean 100
#   c: character values sampled with replacement from starnames[["IAU Name"]]
#   d: factor of values sampled with replacement from state.name
generateData <- function() {
  data.frame(
    a = rnorm(nr),
    # rpois() takes (n, lambda). The previous rpois(100, nr) produced only 100
    # values that data.frame() then recycled across all `nr` rows.
    b = rpois(nr, 100),
    c = sample(starnames[["IAU Name"]], nr, TRUE),
    d = factor(sample(state.name, nr, TRUE)),
    stringsAsFactors = FALSE
  )
}

# Parameter grids for the qs runs: one row per combination of algorithm,
# compression level, shuffle setting and thread count.
grid <- expand.grid(
  algo = "lz4",
  cl = c(1:9, seq(10, 150, 10)),
  shuf = c(0, 7),
  threads = c(1, 4),
  rep = reps,
  stringsAsFactors = FALSE
)
grid3 <- expand.grid(
  algo = "zstd",
  cl = c(seq(-50, -10, by = 5), -5:-1, 1:15),
  shuf = 7,
  threads = c(1, 4),
  rep = reps,
  stringsAsFactors = FALSE
)
grid4 <- expand.grid(
  algo = "zstd_stream",
  cl = c(seq(-50, -10, by = 5), -5:-1, 1:15),
  shuf = 7,
  threads = 1,
  rep = reps,
  stringsAsFactors = FALSE
)
# Stack the three grids, then draw every row once without replacement so the
# configurations run in random order.
grid <- rbind(grid, grid3, grid4)
grid <- sample_n(grid, size = nrow(grid))

# Time one write/read round trip through `outfile` for every row of `grid`.
#
# grid:      data frame of settings, one benchmark run per row.
# write_fun: function(x, i) that writes data frame `x` to `outfile` using the
#            settings in row `i`.
# read_fun:  function(i) that reads `outfile` back and returns the object.
# Returns `grid` with three columns appended: `time` (seconds spent in
# write_fun), `fs` (size of `outfile` as reported by file.info()) and
# `read_time` (seconds spent in read_fun).
bench_roundtrip <- function(grid, write_fun, read_fun) {
  timings <- lapply(seq_len(nrow(grid)), function(i) {
    print(grid[i, ])
    x1 <- generateData()

    started <- as.numeric(Sys.time())
    write_fun(x1, i)
    time <- as.numeric(Sys.time()) - started

    # Drop the in-memory copy and collect garbage before timing the read.
    rm(x1)
    gc()

    started <- as.numeric(Sys.time())
    x1 <- read_fun(i)
    read_time <- as.numeric(Sys.time()) - started

    fs <- file.info(outfile)$size
    rm(x1)
    gc()
    data.frame(time, fs, read_time)
  })
  cbind(grid, rbindlist(timings))
}

# qs: every row of `grid` (lz4, zstd and zstd_stream settings).
res <- bench_roundtrip(
  grid,
  write_fun = function(x, i) {
    qsave(
      x,
      file = outfile, preset = "custom", compress_level = grid$cl[i],
      shuffle_control = grid$shuf[i], algorithm = grid$algo[i],
      nthreads = grid$threads[i], check_hash = FALSE
    )
  },
  read_fun = function(i) qread(file = outfile)
)


# fst: compression levels 0-85 in steps of 5, with 1 and 4 threads.
grid2 <- expand.grid(algo = "fst", cl = seq(0, 85, by = 5), threads = c(1, 4), rep = reps)
grid2 <- sample_n(grid2, nrow(grid2))
res2 <- bench_roundtrip(
  grid2,
  write_fun = function(x, i) {
    # Setting the thread count is part of the timed write, as before.
    threads_fst(grid2$threads[i])
    write_fst(x, path = outfile, compress = grid2$cl[i])
  },
  read_fun = function(i) read_fst(outfile)
)

# Base-R baseline: lserialize()/readRDS() when threads == 1, otherwise
# mserialize()/mcreadRDS().
gridb <- expand.grid(algo = "saveRDS", threads = c(1), cl = NA, rep = reps)
gridb <- sample_n(gridb, nrow(gridb))
resb <- bench_roundtrip(
  gridb,
  write_fun = function(x, i) {
    # Plain if/else: ifelse() is meant for vectors of values, not functions.
    savefun <- if (gridb$threads[i] == 1) lserialize else mserialize
    savefun(x, outfile)
  },
  read_fun = function(i) {
    readfun <- if (gridb$threads[i] == 1) readRDS else mcreadRDS
    readfun(outfile)
  }
)

# Give each result set a `method` label used for colouring and grouping later.
res <- mutate(res, method = sprintf("qs:%s shuffle=%s", algo, shuf))
res2 <- mutate(res2, method = "fst")
resb <- mutate(resb, method = "saveRDS / readRDS")

# Stack the shared columns of all three result sets into one table.
rf <- rbind(
  dplyr::select(res, method, threads, time, read_time, fs, cl),
  dplyr::select(res2, method, threads, time, read_time, fs, cl),
  dplyr::select(resb, method, threads, time, read_time, fs, cl)
)
# Rescale the file size by 1e6 (bytes -> MB).
rf <- mutate(rf, fs = fs / 1e6)

# Persist the raw measurements so the plots can be rebuilt without re-running.
saveRDS(rf, file = "bench_data_v20.1.rds")

#########################
# Reload the saved measurements and average them per configuration.
rf <- readRDS("bench_data_v20.1.rds")

# Drop two configurations (qs at level 120, fst at level 15), then average
# every method / threads / level combination.
rf <- rf %>%
  filter(!(method %like% "qs" & cl == 120)) %>%
  filter(!(method %like% "fst" & cl == 15)) %>%
  group_by(method, threads, cl) %>%
  summarize(time = mean(time), read_time = mean(read_time), fs = mean(fs)) %>%
  ungroup()
# rf$threads <- ifelse(threads == 1, "1 thread", "4 threads")

# Single-thread points that get a black marker and a text label in the plots.
# "fast" is taken at lz4 level 100 so it matches the row tagged "(fast)" in the
# summary table; it was previously taken at level 150.
# NOTE(review): confirm level 100 against the qs preset definitions.
rfavglab <- rf %>%
  filter(threads == 1) %>%
  group_by(method, cl) %>%
  summarize(time = mean(time), read_time = mean(read_time), fs = mean(fs)) %>%
  ungroup() %>%
  filter(
    (method == "qs:lz4 shuffle=0" & cl == 100) |
      (method == "qs:lz4 shuffle=7" & cl == 1) |
      (method == "qs:zstd shuffle=7" & cl == 4)
  ) %>%
  arrange(method) %>%
  # Look the label up by method instead of relying on exactly three rows
  # arriving in a fixed order.
  mutate(label = unname(c(
    "qs:lz4 shuffle=0" = "fast",
    "qs:lz4 shuffle=7" = "balanced",
    "qs:zstd shuffle=7" = "high"
  )[as.character(method)]))


# rfavg <- rf %>%
#   group_by(method, cl) %>% summarize(time = mean(time), fs = mean(fs))

# Write benchmark plot: file size (x) against write time (y, log scale).
g <- ggplot()

# One point per row of `rf`, coloured by method and shaped by thread count.
g <- g + geom_point(
  data = rf,
  mapping = aes(x = fs, y = time, color = method, shape = factor(threads)),
  alpha = 0.5,
  show.legend = TRUE
)
# Alternatives that are currently switched off:
# g <- g + geom_spline(data = rf, mapping=aes(x=fs, color=method, y=time, lty=factor(threads)), nknots=5, spar = 0.25, lwd=.81)
# g <- g + geom_line(data = rf, mapping=aes(x=fs, color=method, y=time, lty=factor(threads)), lwd=.81)

# Black markers plus text labels for the rows in `rfavglab`.
g <- g + geom_point(
  data = rfavglab,
  mapping = aes(x = fs, y = time),
  color = "black",
  alpha = 1,
  show.legend = FALSE
)
g <- g + geom_text(
  data = rfavglab,
  mapping = aes(x = fs, y = time + .05, label = label),
  color = "black",
  size = 3.5,
  show.legend = FALSE
)

g <- g + theme_ipsum_rc()
g <- g + theme(
  plot.margin = unit(rep(.5, 4), "lines"),
  # legend.position="bottom",
  legend.key.size = unit(1, "lines"),
  legend.box.just = "left",
  legend.text = element_text(margin = margin(r = 1, unit = "lines")),
  axis.title.x = element_text(size = rel(1.3)),
  axis.title.y = element_text(size = rel(1.3))
)
g <- g + labs(x = "File size (MB)", y = "Time (s)", color = "Algorithm", shape = "Threads")
g <- g + scale_y_log10(breaks = c(.1, .2, .5, 1, 2, 5, 10, 20, 50))
# g <- g + scale_x_sqrt(limits=c(NA,NA), breaks=seq(0,500,by=12))
# g <- g + guides(color=guide_legend(nrow=2,byrow=T))

plot(g)
ggsave(
  filename = "df_bench_complete_write_v20.1.png",
  plot = g,
  width = 6.5,
  height = 4.5,
  dpi = 300
)

# Read benchmark plot: file size (x) against read time (y, log scale).
g <- ggplot()

# Points per row of `rf`; their legend is suppressed.
g <- g + geom_point(
  data = rf,
  mapping = aes(x = fs, y = read_time, color = method, shape = factor(threads)),
  alpha = 0.5,
  show.legend = FALSE
)
# g <- g + geom_spline(data = rf, mapping=aes(x=fs, color=method, y=read_time, lty=factor(threads)), nknots=5, spar = 0.25, lwd=.81)
# Lines through the same rows, with line type mapped to thread count.
g <- g + geom_line(
  data = rf,
  mapping = aes(x = fs, y = read_time, color = method, lty = factor(threads)),
  lwd = .81
)

# Black markers plus text labels for the rows in `rfavglab`.
g <- g + geom_point(
  data = rfavglab,
  mapping = aes(x = fs, y = read_time),
  color = "black",
  alpha = 1
)
g <- g + geom_text(
  data = rfavglab,
  mapping = aes(x = fs, y = read_time + .05, label = label),
  color = "black",
  size = 3.5
)

g <- g + theme_ipsum_rc()
g <- g + theme(
  plot.margin = unit(rep(.5, 4), "lines"),
  legend.position = "right",
  legend.key.size = unit(1, "lines"),
  legend.box.just = "left",
  legend.text = element_text(margin = margin(r = 1, unit = "lines")),
  axis.title.x = element_text(size = rel(1.3)),
  axis.title.y = element_text(size = rel(1.3))
)
g <- g + labs(x = "File size (MB)", y = "Time (s)", color = "Algorithm", lty = "Threads")
g <- g + scale_y_log10(breaks = c(.1, .2, .5, 1, 2, 5, 10, 20, 50))
# g <- g + scale_x_sqrt(limits=c(NA,NA), breaks=seq(0,500,by=12))
# g <- g + guides(color=guide_legend(nrow=2,byrow=T))

plot(g)
ggsave(
  filename = "df_bench_complete_read_v20.1.png",
  plot = g,
  width = 6.5,
  height = 4.5,
  dpi = 300
)


# Summary table: mean write time, read time and file size for a hand-picked set
# of configurations, with the serialize/readRDS baseline listed first.
sumtab <- rf %>%
  group_by(method, cl, threads) %>%
  summarize(
    `Write Time (s)` = mean(time),
    `Read Time (s)` = mean(read_time),
    `File Size (Mb)` = mean(fs)
  ) %>%
  ungroup() %>%
  filter(
    (method == "qs:lz4 shuffle=0" & cl == 100) |
      (method == "qs:lz4 shuffle=7" & cl == 1) |
      (method == "qs:zstd shuffle=7" & cl == 4) |
      (method == "qs:zstd_stream shuffle=7" & cl == 14) |
      (method == "fst" & cl == 0) |
      (method == "fst" & cl == 50) |
      (method == "fst" & cl == 85) |
      (method == "saveRDS / readRDS")
  ) %>%
  dplyr::rename(Algorithm = method, Threads = threads)

# Fold the compression level into the algorithm name (baseline rows have none).
sumtab <- sumtab %>%
  mutate(Algorithm = ifelse(Algorithm != "saveRDS / readRDS", sprintf("%s C=%s", Algorithm, cl), Algorithm)) %>%
  dplyr::select(Algorithm, everything()) %>%
  dplyr::select(-cl)

# Tag the first row of each named configuration. Rows are found by name rather
# than by fixed position, so the tags stay correct when the number of
# benchmarked configurations changes.
preset_suffix <- c(
  "qs:lz4 shuffle=0 C=100" = "(fast)",
  "qs:lz4 shuffle=7 C=1" = "(balanced)",
  "qs:zstd shuffle=7 C=4" = "(high)",
  "qs:zstd_stream shuffle=7 C=14" = "(archive)"
)
tag_rows <- which(
  sumtab$Algorithm %in% names(preset_suffix) & !duplicated(sumtab$Algorithm)
)
sumtab$Algorithm[tag_rows] <- paste(
  sumtab$Algorithm[tag_rows],
  preset_suffix[sumtab$Algorithm[tag_rows]]
)

# Move the baseline rows to the top. The old sumtab[14:15, ] indexed a row that
# does not exist when the baseline is run with a single thread setting.
is_baseline <- sumtab$Algorithm == "saveRDS / readRDS"
sumtab <- rbind(sumtab[is_baseline, ], sumtab[!is_baseline, ])
# write.csv(sumtab, file="df_bench_summary.csv", row.names=F)
	# A second, tab-indented copy of the entire script used to follow here. It was
	# removed: it repeated every benchmark run and overwrote the same output
	# files, and its `|` operators had been escaped to `\|`, which R cannot parse.