Skip to content

Instantly share code, notes, and snippets.

@hannes
Created Mar 7, 2022
Embed
What would you like to do?
library(ggplot2)
library(ggthemes)
library(dplyr)
library(directlabels)
# Shared theme for every figure in this script: ggthemes' "few" theme at base
# size 24, serif text, no x-axis ticks, and a legend without title or
# background.
our_theme <- theme_few(base_size = 24) +
  theme(
    # Text and axes
    text = element_text(family = "serif"),
    axis.ticks.x = element_blank(),
    axis.title.x = element_text(vjust = -0.1),
    axis.title.y = element_text(vjust = 0.9),
    # Legend appearance
    legend.title = element_blank(),
    legend.background = element_blank(),
    legend.margin = margin(0, 0, 0, 0),
    # Legend placement
    legend.position = c(0.60, 0.70),
    legend.justification = c("left", "bottom"),
    legend.box.just = "left"
  )
# Load the aggregation benchmark results from a tab-separated file; the column
# names are supplied here rather than read from the file.
aggr_results <- read.table(
  "aggr-bench.tsv",
  sep = "\t",
  col.names = c("nrow", "ngrp", "sys", "median_time_seconds")
)

# Display names for the benchmarked systems, keyed by the identifiers that
# appear in the `sys` column.
snames <- c(
  pandas = "Pandas",
  duckdb = "DuckDB",
  polars = "Polars",
  arrow = "Arrow"
)

# Keep only runs below one billion rows and swap the system identifiers for
# their display names.
aggr_results <- aggr_results |>
  filter(nrow < 1e9) |>
  # as.character() keeps this a lookup by name even if `sys` arrives as a
  # factor (indexing by a factor would use its integer codes); unname() drops
  # the lookup keys so the column is a plain character vector.
  mutate(sys = unname(snames[as.character(sys)]))
print(aggr_results)
# Render one log-log timing plot to an SVG file (8 x 5 inches).
#
# Arguments:
#   data      Data frame to plot. Must contain the column named by `x_col`
#             plus `median_time_seconds` and `sys`.
#   x_col     Name (string) of the column mapped to the x axis.
#   x_values  Vector whose unique values become the x-axis breaks. Breaks are
#             labelled as 10^k, so each value is expected to be a power of ten
#             (sprintf's "%d" rejects a non-whole log10 result).
#   x_label   Title of the x axis.
#   x_limits  Numeric vector of length 2 with the x-axis limits.
#   file      Path of the SVG file to write.
#
# Returns `file`, invisibly. The graphics device is closed on exit even when
# building or printing the plot fails, so a failed plot cannot leave a stray
# device open for later plots to draw into.
plot_aggr_bench <- function(data, x_col, x_values, x_label, x_limits, file) {
  breaks <- unique(x_values)
  labels <- parse(text = sprintf("10^%d", log10(breaks)))

  svg(file, height = 5, width = 8)
  on.exit(dev.off(), add = TRUE)

  # NOTE(review): `size` on geom_line() is reported as deprecated by newer
  # ggplot2 releases in favour of `linewidth`; kept so the script also runs
  # on older ggplot2 versions.
  print(
    ggplot(
      data,
      aes(
        x = .data[[x_col]],
        y = median_time_seconds,
        colour = sys,
        shape = sys
      )
    ) +
      geom_line(size = 1.2) +
      geom_point(size = 3) +
      our_theme +
      ylab("Time (s)") +
      xlab(x_label) +
      scale_x_log10(limits = x_limits, breaks = breaks, labels = labels) +
      scale_y_log10() +
      # Legends are disabled; each series is labelled directly at its last
      # point instead.
      scale_colour_discrete(guide = "none") +
      scale_shape_discrete(guide = "none") +
      geom_dl(
        aes(label = sys),
        method = list(
          "last.bumpup",
          cex = 1.5,
          hjust = -0.1,
          fontfamily = "serif"
        )
      )
  )

  invisible(file)
}

# Varying group count, 100M rows.
plot_aggr_bench(
  data = aggr_results |> filter(nrow == 1e8),
  x_col = "ngrp",
  x_values = aggr_results$ngrp,
  x_label = "# Groups",
  x_limits = c(1e3, 1e9),
  file = "aggr-bench-groups.svg"
)

# Varying row count, ~1000 groups.
plot_aggr_bench(
  data = aggr_results |> filter(ngrp == 1e3),
  x_col = "nrow",
  x_values = aggr_results$nrow,
  x_label = "# Rows",
  x_limits = c(1e6, 2 * 1e8),
  file = "aggr-bench-rows-fewgroups.svg"
)

# Varying row count, # groups = # rows.
plot_aggr_bench(
  data = aggr_results |> filter(ngrp == nrow),
  x_col = "nrow",
  x_values = aggr_results$nrow,
  x_label = "# Rows",
  x_limits = c(1e6, 2 * 1e8),
  file = "aggr-bench-rows-manygroups.svg"
)
# Illustration only: contrast the growth of n with that of n * log(n)
# (natural log) for n = 1000, 2000, ..., 10000.
n <- 1000 * 1:10
nlogn <- data.frame(
  n = rep(n, times = 2),
  O = rep(c("O(n)", "O(nlogn)"), each = length(n)),
  C = c(n, n * log(n))
)
svg("aggr-bench-nlogn.svg", height = 5, width = 8)
print(
  ggplot(nlogn, aes(x = n, y = C, colour = O, shape = O)) +
    our_theme +
    geom_line(size = 1.2) +
    geom_point(size = 3) +
    labs(x = "n", y = "O") +
    # Upper limit extends past the last n so the direct labels have room.
    scale_x_continuous(breaks = c(1000, 10000), limits = c(1000, 11500)) +
    scale_colour_discrete(guide = "none") +
    scale_shape_discrete(guide = "none") +
    geom_dl(
      aes(label = O),
      method = list(
        "last.bumpup",
        cex = 1.5,
        hjust = -0.1,
        fontfamily = "serif"
      )
    )
)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment