Skip to content

Instantly share code, notes, and snippets.

@mikmart
Last active March 1, 2019 13:01
Show Gist options
  • Save mikmart/8ddc0cebec481eb907b2528add805d50 to your computer and use it in GitHub Desktop.
Save mikmart/8ddc0cebec481eb907b2528add805d50 to your computer and use it in GitHub Desktop.
Testing read/write times for data frames in different file formats
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(DBI)
library(MonetDBLite)
library(feather)
library(fst)

# Benchmark size: repetitions per timed expression and rows in the test data.
times <- 5
n <- 5e6

# `sample()` with `replace = TRUE` pre-filled, so callers only pass the
# population and the number of draws.
resample <- purrr::partial(sample, replace = TRUE)

# Switch to the session's temporary directory so that relative paths
# resolve there.
td <- tempdir()
setwd(td)

# Synthetic benchmark data with `n` rows and one column per common type:
# date-time, logical, integer, double and character.
df <- data.frame(
  # Timestamps scattered around the current time.
  time = Sys.time() + rnorm(n) * 1e7,
  bool = resample(c(TRUE, FALSE), n),
  # A single number as population means "draw from 1:1000".
  int = resample(1000, n),
  dbl = rnorm(n, sd = 1000),
  chr = resample(letters, n),
  # Before R 4.0.0 the default silently converted `chr` to a factor, so the
  # "character" column was not actually benchmarked as character data.
  # Being explicit keeps the column type the same on every R version.
  stringsAsFactors = FALSE
)

# Open a MonetDBLite connection with the database name "monet"; it is
# closed again at the end of the script.
monet <- DBI::dbConnect(
  MonetDBLite::MonetDBLite(),
  dbname = "monet"
)

# Write benchmark: time storing `df` as an RDS file, a feather file, an fst
# file (with `compress = 0`) and a MonetDBLite table. Each expression is
# evaluated `times` times. The file names are relative, so the files land in
# the current working directory; the database table "df" is overwritten on
# every run.
microbenchmark::microbenchmark(
  saveRDS(df, "rds"),
  write_feather(df, "feather"),
  write_fst(df, "fst", compress = 0),
  dbWriteTable(monet, "df", df, overwrite = TRUE),
  times = times
)
#> Identifier(s) "time", "int" are reserved SQL keywords and need(s) to be quoted in queries.
#> Identifier(s) "time", "int" are reserved SQL keywords and need(s) to be quoted in queries.
#> Identifier(s) "time", "int" are reserved SQL keywords and need(s) to be quoted in queries.
#> Identifier(s) "time", "int" are reserved SQL keywords and need(s) to be quoted in queries.
#> Identifier(s) "time", "int" are reserved SQL keywords and need(s) to be quoted in queries.
#> Unit: milliseconds
#>                                             expr         min         lq
#>                               saveRDS(df, "rds") 11544.71686 11777.7724
#>                     write_feather(df, "feather")   325.20101   325.4623
#>               write_fst(df, "fst", compress = 0)    90.45394   108.2210
#>  dbWriteTable(monet, "df", df, overwrite = TRUE)  1570.86544  1677.5482
#>        mean     median         uq        max neval
#>  12511.1462 12195.1468 13067.0356 13971.0592     5
#>    341.5257   325.4959   349.1955   382.2738     5
#>    127.1958   113.2424   123.6935   200.3680     5
#>   1702.9170  1694.2976  1743.8645  1828.0091     5

# Read benchmark: time loading the full data set back from each of the four
# stores. It expects the files "rds", "feather" and "fst" in the working
# directory and the table "df" in the `monet` database. For the database,
# `collect()` retrieves the table contents into R.
microbenchmark::microbenchmark(
  readRDS("rds"),
  read_feather("feather"),
  read_fst("fst"),
  tbl(monet, "df") %>% collect(),
  times = times
)
#> Unit: milliseconds
#>                            expr       min        lq     mean   median
#>                  readRDS("rds") 829.08102 835.73890 852.4980 835.9805
#>         read_feather("feather") 130.47369 160.96860 187.1435 166.2264
#>                 read_fst("fst")  84.74794  97.05555 131.5531 113.2991
#>  tbl(monet, "df") %>% collect() 144.19685 214.83650 253.4896 240.0959
#>        uq      max neval
#>  844.9840 916.7057     5
#>  238.9650 239.0840     5
#>  137.4563 225.2067     5
#>  329.8362 338.4824     5

# Keep only the rows of `data` whose `chr` value is "a" or "b".
f <- function(data) {
  dplyr::filter(data, chr %in% c("a", "b"))
}

# Read-and-filter benchmark: time loading the data from each store and then
# applying `f()`. For the file formats the whole data set is read first and
# filtered afterwards. For the database `f()` is applied before `collect()`
# (presumably so the filtering happens in the database -- TODO confirm).
# `df %>% f()` filters the data frame already held in memory and serves as a
# baseline without any reading step.
microbenchmark::microbenchmark(
  readRDS("rds") %>% f(),
  read_feather("feather") %>% f(),
  read_fst("fst") %>% f(), 
  tbl(monet, "df") %>% f() %>% collect(),
  df %>% f(),
  times = times
)
#> Unit: milliseconds
#>                                    expr      min        lq      mean
#>                  readRDS("rds") %>% f() 990.2467 1049.8131 1081.5837
#>         read_feather("feather") %>% f() 434.8617  441.3463  456.8683
#>                 read_fst("fst") %>% f() 325.0571  388.1029  387.8848
#>  tbl(monet, "df") %>% f() %>% collect() 186.9416  194.9822  201.6705
#>                              df %>% f() 193.6270  197.0525  208.6567
#>     median        uq       max neval
#>  1105.8664 1126.1228 1135.8693     5
#>   444.2382  464.8642  499.0311     5
#>   399.1494  412.0324  415.0822     5
#>   198.3381  204.4821  223.6084     5
#>   214.9407  217.1024  220.5610     5

# Close the MonetDBLite connection, passing `shutdown = TRUE`.
DBI::dbDisconnect(
  monet,
  shutdown = TRUE
)

Created on 2019-03-01 by the reprex package (v0.2.1.9000)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment