Skip to content

Instantly share code, notes, and snippets.

@amatsuo
Last active March 28, 2019 09:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amatsuo/e5def48d5a5642520c180fe2686dd2c6 to your computer and use it in GitHub Desktop.
Save amatsuo/e5def48d5a5642520c180fe2686dd2c6 to your computer and use it in GitHub Desktop.
# Scalability comparison of quanteda and spacyr tokenizers
library(quanteda)
library(spacyr)
library(tidyverse)
library(microbenchmark)
# Start the spaCy backend that spacyr's tokenizer calls into.
spacy_initialize()

# Pull the raw document texts out of the Irish budget corpus and drop the
# document names, leaving a plain unnamed vector of texts.
# NOTE(review): texts() is reported as deprecated in newer quanteda releases
# (as.character() being the replacement) -- confirm against the installed
# version before changing it.
data_text_irishbudget2010 <- unname(texts(data_corpus_irishbudget2010))
# Time both tokenizers on the corpus repeated 1, 2, 5, 10, 50 and 100 times.
#
# Each benchmarked expression is named "<package>_ib_<n_repeat>"; the
# reporting code downstream splits that name on "_", so the naming scheme must
# be kept. The expressions are built programmatically and handed to
# microbenchmark() as unevaluated calls via `list =`; rep() therefore still
# runs inside the timed expression, and the single-copy case passes the texts
# directly without rep().
bench_scalability <- local({
  repeat_counts <- c(1L, 2L, 5L, 10L, 50L, 100L)

  # Build the named, unevaluated tokenizer calls for one package.
  build_calls <- function(label, tokenizer) {
    calls <- lapply(repeat_counts, function(k) {
      corpus_arg <- if (k == 1L) {
        quote(data_text_irishbudget2010)
      } else {
        bquote(rep(data_text_irishbudget2010, .(k)))
      }
      as.call(list(tokenizer, corpus_arg))
    })
    setNames(calls, paste0(label, "_ib_", repeat_counts))
  }

  # spacyr expressions first, then quanteda, matching the reporting order.
  microbenchmark(
    list = c(
      build_calls("spacyr", quote(spacy_tokenize)),
      build_calls("quanteda", quote(tokens))
    ),
    times = 5
  )
})
# Print the raw microbenchmark timings.
bench_scalability

# Plot the mean run time of each tokenizer against the repetition factor, with
# both axes on a log10 scale. Expression names have the form
# "<package>_<text>_<n_repeat>" and are split on "_" to recover the package and
# the repetition count. mean(time) is divided by 1e9 to give the seconds shown
# on the y axis (microbenchmark records nanoseconds).
# NOTE(review): the spaCy session started earlier is never closed; consider
# calling spacy_finalize() once the script is done -- confirm desired behavior.
ggplot(
  data = bench_scalability %>%
    group_by(expr) %>%
    summarise(avg_time = mean(time) / 1e9) %>%
    separate(
      col = expr,
      into = c("package", "text", "n_repeat"),
      sep = "_"
    ) %>%
    mutate(n_repeat = as.numeric(n_repeat)),
  mapping = aes(
    x = n_repeat,
    y = avg_time,
    colour = package,
    group = package
  )
) +
  geom_line() +
  scale_x_log10(name = "Corpus Size") +
  scale_y_log10(name = "Average time (sec)") +
  theme_minimal()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment