Created
August 3, 2023 19:42
-
-
Save beniaminogreen/ce284a7fcb59d7d759f7f8cdc8b161f6 to your computer and use it in GitHub Desktop.
small benchmark for zoomerjoin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(zoomerjoin) | |
library(fuzzyjoin) | |
library(tidyverse) | |
library(microbenchmark) | |
library(profmem) | |
# Sample million rows from DIME dataset | |
data_1 <- as.data.frame(sample_n(dime_data, 10^6)) | |
names(data_1) <- c("id_1", "name") | |
data_2 <- as.data.frame(sample_n(dime_data, 10^6)) | |
names(data_2) <- c("id_2", "name") | |
# Generate datasets for euclidean join benchmarking | |
n <- 10^5 | |
p <- 50 | |
X <- matrix(rnorm(n * p), n, p) | |
X_1 <- as.data.frame(X) | |
X_2 <- as.data.frame(X + .000000001) | |
# Get time and memory use statistics for fuzzyjoin when performing jaccard join | |
fuzzy_jaccard_bench <- function(n){ | |
time <- microbenchmark( | |
stringdist_inner_join(data_1[1:n, ], | |
data_2[1:n, ], | |
method = "jaccard", | |
max_dist = .6, | |
q = 4 | |
), | |
times = 4 | |
)$time %>% | |
median() | |
mem <- profmem(stringdist_inner_join(data_1[1:n, ], | |
data_2[1:n, ], | |
method = "jaccard", | |
max_dist = .6, | |
q = 4 | |
)) %>% | |
total() | |
return(c(time = time, memory = mem)) | |
} | |
# Get time and memory use statistics for zoomerjoin when performing jaccard join | |
zoomer_jaccard_bench <- function(n) { | |
time <- microbenchmark( | |
jaccard_inner_join(data_1[1:n, ], data_2[1:n, ], | |
by = "name", band_width = 11, | |
n_bands = 350, threshold = .7, | |
n_gram_width = 4 | |
), | |
times = 4 | |
)$time %>% | |
median() | |
mem <- profmem( | |
jaccard_inner_join(data_1[1:n, ], data_2[1:n, ], | |
by = "name", band_width = 11, | |
n_bands = 350, threshold = .7, | |
n_gram_width = 4 | |
) | |
) %>% | |
total() | |
return(c(time = time, memory = mem)) | |
} | |
# Get time and memory use statistics for fuzzyjoin when performing Euclidean join | |
fuzzy_euclid_bench <- function(n) { | |
time <- microbenchmark( | |
distance_join(X_1[1:n, ], X_2[1:n, ], max_dist = .1, method = "euclidean"), | |
times = 4 | |
)$time %>% | |
median() | |
mem <- total(profmem( | |
distance_join(X_1[1:n, ], X_2[1:n, ], max_dist = .1, method = "euclidean") | |
)) | |
return(c(time = time, memory = mem)) | |
} | |
# Get time and memory use statistics for zoomerjoin when performing Euclidean join | |
zoomer_euclid_bench <- function(n) { | |
time <- microbenchmark( | |
euclidean_inner_join(X_1[1:n, ], X_2[1:n, ], | |
threshold = .1, n_bands = 90, | |
band_width = 2, r = .1 | |
), | |
times = 4 | |
)$time %>% | |
median() | |
mem <- profmem(euclidean_inner_join(X_1[1:n, ], X_2[1:n, ], | |
threshold = .1, n_bands = 90, | |
band_width = 2, r = .1 | |
)) %>% | |
total() | |
return(c(time = time, memory = mem)) | |
} | |
# Run Grid of Jaccard Benchmarks, Collect results into DF | |
n <- seq(100, 1000, 100) | |
names(n) <- n | |
fuzzy_jacard_benches <- map_df(n, fuzzy_jaccard_bench, .id="n") | |
zoomer_jacard_benches <- map_df(n, zoomer_jaccard_bench, .id="n") | |
fuzzy_jacard_benches$package <- "fuzzyjoin" | |
zoomer_jacard_benches$package <- "zoomerjoin" | |
jaccard_benches <- bind_rows(fuzzy_jacard_benches, zoomer_jacard_benches) | |
jaccard_benches$join_type <- "Jaccard Distance" | |
# Run Grid of Euclidean Benchmarks, Collect results into DF | |
n <- seq(100, 1000, 100) | |
names(n) <- n | |
fuzzy_euclid_benches <- map_df(n, fuzzy_euclid_bench, .id="n") | |
zoomer_euclid_benches <- map_df(n, zoomer_euclid_bench, .id="n") | |
fuzzy_euclid_benches$package <- "fuzzyjoin" | |
zoomer_euclid_benches$package <- "zoomerjoin" | |
euclid_benches <- bind_rows(fuzzy_euclid_benches, zoomer_euclid_benches) | |
euclid_benches$join_type <- "Euclidean Distance" | |
sim_data <- bind_rows(euclid_benches, jaccard_benches) %>% | |
pivot_longer(c(time, memory)) %>% | |
mutate(value = ifelse(name =="time", value / 10^9, value / 10^6)) # convert ns to s and bytes to Gb. | |
write_csv(sim_data, "sim_data.csv") | |
sim_data %>% | |
mutate( | |
name = ifelse(name == "time", "Time Usage (s)", "Memory Usage (MB)"), | |
join_type = ifelse(join_type == "Jaccard Distance", | |
"Jaccard Distance Join", | |
"Euclidean Distance Joins"), | |
) %>% | |
ggplot(aes(x=as.numeric(n), y=value, col = package, linetype = package)) + | |
geom_point() + | |
geom_line() + | |
facet_wrap(~ join_type + name, scales = 'free') + | |
scale_y_continuous("Time (s) / memory (MB)") | |
ggsave("bench_out.png") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment