Skip to content

Instantly share code, notes, and snippets.

@lvalnegri
Last active November 10, 2022 22:47
Show Gist options
  • Save lvalnegri/6587c2bce82b6ca7d0d4257660e76619 to your computer and use it in GitHub Desktop.
Save lvalnegri/6587c2bce82b6ca7d0d4257660e76619 to your computer and use it in GitHub Desktop.
# NYC Taxi TLC Trip Record Data
# https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
# year 2011, 176mln rows, 12GB RAM
# Work inside the scratch directory: every benchmark file below is addressed
# by a relative path.
setwd('./Rstorage')
# Leave two cores free, but never request fewer than one thread.
# parallel::detectCores() may return NA when the count cannot be determined;
# max(..., na.rm = TRUE) then falls back to 1.
# (Author's machine: Ryzen 9 5950X, 32 cores => nth = 30.)
nth <- max(1L, parallel::detectCores() - 2L, na.rm = TRUE)
data.table::setDTthreads(nth)
# Load the 2011 trip records as a data.table; `y` is the object that every
# writer benchmarked below serializes.
y <- fst::read_fst(file.path(Rfuns::data_path, 'us', 'nyc_taxi', '2011'), as.data.table = TRUE)
# csv: 11.2GB (zipped size not recorded)
# rds default: 1.4GB
# rds uncompressed: 11.2GB
# fst: 3GB
# qs: 2.2GB
# parquet: 1.4GB
# Benchmark writing `y` to disk in each storage format (5 runs per expression).
# Expressions sharing a target ('2011.csv', '2011.qs') overwrite each other's
# output; the files left behind are the inputs of the read benchmarks below.
microbenchmark::microbenchmark(
  # data.table::fwrite with 1 thread vs. `nth` threads
  'dt1' = { data.table::setDTthreads(1); data.table::fwrite(y, '2011.csv') },
  'dtx' = { data.table::setDTthreads(nth); data.table::fwrite(y, '2011.csv') },
  'rdr' = readr::write_csv(y, '2011.csv'),
  # saveRDS with its default compression vs. uncompressed
  'rds' = saveRDS(y, '2011.rds'),
  'rnc' = saveRDS(y, '2011.rnc', compress = FALSE),
  'fst' = fst::write_fst(y, '2011.fst'),
  # qs with its default thread count vs. `nth` threads
  'qs1' = qs::qsave(y, '2011.qs'),
  'qsx' = qs::qsave(y, '2011.qs', nthreads = nth),
  # Must be '2011.prq': the read benchmark, the dataset filter and the final
  # cleanup all refer to the Parquet file by that name.
  'prq' = arrow::write_parquet(y, '2011.prq'),
  times = 5
)
# Unit: seconds
# expr min lq mean median uq max neval cld
# dt1 43.506698 43.601451 60.065286 59.142174 76.529121 78.470098 4 e
# dtx 3.885360 5.071990 5.783191 6.360932 6.494393 6.525542 4 ab
# rdr 20.462667 20.822410 21.702479 21.876254 22.582548 22.594739 4 cd
# rds 310.219050 314.627363 317.984075 319.508269 321.340787 322.700713 4 f
# rnc 11.780769 13.585253 14.649857 15.530841 15.714461 15.756976 4 a c
# fst 1.730701 2.188686 2.454120 2.676605 2.719554 2.732569 4 a
# qs1 35.756636 36.093977 36.608101 36.665860 37.122225 37.344047 4 d
# qsx 10.479202 10.589914 10.835403 10.874815 11.080891 11.112778 4 a c
# prq 18.032731 18.181001 18.515467 18.446190 18.849933 19.136758 4 bc
# Benchmark reading each file back into memory (10 runs per expression).
# The results are discarded; only the elapsed time matters.
microbenchmark::microbenchmark(
  # data.table::fread with 1 thread
  dt1 = {
    data.table::setDTthreads(1)
    data.table::fread('2011.csv')
  },
  # data.table::fread with `nth` threads
  dtx = {
    data.table::setDTthreads(nth)
    data.table::fread('2011.csv')
  },
  rdr = readr::read_csv('2011.csv'),
  # compressed and uncompressed RDS files
  rds = readRDS('2011.rds'),
  rnc = readRDS('2011.rnc'),
  fst = fst::read_fst('2011.fst', as.data.table = TRUE),
  # qs with its default thread count, 6 threads and `nth` threads
  qs1 = qs::qread('2011.qs'),
  qs6 = qs::qread('2011.qs', nthreads = 6),
  qsx = qs::qread('2011.qs', nthreads = nth),
  prq = arrow::read_parquet('2011.prq'),
  times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dt1 15150.102 15384.9305 15805.230 15822.712 15950.497 16602.819 10 g
# dtx 1625.603 1804.4039 2053.498 2176.882 2234.223 2355.109 10 b
# rdr 23429.444 23672.0207 23752.863 23718.268 23903.303 24048.548 10 h
# rds 26042.728 27223.5948 27243.311 27459.466 27535.535 27862.663 10 i
# rnc 9994.286 10206.1144 10534.082 10593.441 10886.925 11118.656 10 f
# fst 892.296 905.2518 1144.050 1089.277 1282.975 1609.692 10 a
# qs1 9140.779 9514.6906 9603.860 9570.558 9667.102 10156.028 10 e
# qs6 5696.388 5773.2970 5906.207 5949.036 5953.662 6108.717 10 c
# qsx 7913.974 8072.5697 8346.366 8401.842 8429.219 8910.961 10 d
# Write `y` through Rfuns' indexed-fst writer, passing pu_month and pu_day as
# `cname` (presumably the index columns -- confirm against Rfuns).
Rfuns::write_fst_idx('2011', cname = c('pu_month', 'pu_day'), dts = y)
# Benchmark extracting the rows with pu_month == 8 and pu_day <= 10 in three
# ways (10 runs per expression).
microbenchmark::microbenchmark(
  # One indexed read per day, stacked into a single table.
  # NOTE(review): assumes c(8, x) selects pu_month == 8, pu_day == x -- confirm.
  'idx' = data.table::rbindlist(lapply(1:10, \(x) Rfuns::read_fst_idx('2011', c(8, x)))),
  # Read the whole fst file, then filter in memory. The result is subset
  # directly: the magrittr pipe `%>%` is never attached by this script.
  'nox' = fst::read_fst('2011.fst', as.data.table = TRUE)[pu_month == 8 & pu_day <= 10],
  # Filter the Parquet file through arrow's dataset interface, then collect.
  'prq' = arrow::open_dataset('2011.prq') |>
    dplyr::filter(pu_month == 8 & pu_day <= 10) |>
    dplyr::collect(),
  times = 10
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# idx 144.7218 145.1878 156.8216 153.5852 164.5233 186.6884 10 a
# nox 1805.2652 1890.6643 2122.4655 2077.5755 2357.2330 2559.6328 10 b
# prq 239.7660 242.6931 248.3015 247.7814 249.1476 261.9594 10 a
# Delete the files the benchmarks created in the working directory.
# file.remove() is vectorized, so a single call covers every path; paths that
# do not exist are skipped so a missing file does not raise a warning.
bench_files <- c('2011', paste0('2011.', c('idx', 'csv', 'fst', 'qs', 'prq', 'rds', 'rnc')))
file.remove(bench_files[file.exists(bench_files)])
# Drop only the objects this script created rather than wiping the whole
# workspace, then release the memory held by the data.
rm(y, nth, bench_files)
gc()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment