Last active
November 10, 2022 22:47
-
-
Save lvalnegri/6587c2bce82b6ca7d0d4257660e76619 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NYC Taxi TLC Trip Record Data | |
# https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page | |
# year 2011, 176mln rows, 12GB RAM | |
# All benchmark files below are written to and read from relative paths, so
# the script runs inside the scratch directory.
setwd('./Rstorage')

# Use all but two of the detected cores. detectCores() may return NA, and a
# machine with two or fewer cores would yield a non-positive count, so the
# value is clamped to at least one thread.
# Ryzen 9 5950X 32 cores (=> nth = 30)
nth <- max(1L, parallel::detectCores() - 2L, na.rm = TRUE)
data.table::setDTthreads(nth)

# Load the 2011 trip records as a data.table; `y` is the object that every
# writer in the benchmarks below serialises.
y <- fst::read_fst(
  file.path(Rfuns::data_path, 'us', 'nyc_taxi', '2011'),
  as.data.table = TRUE
)
# csv: 11.2GB (zipped: GB) | |
# rds default: 1.4GB | |
# rds uncompressed: 11.2GB | |
# fst: 3GB | |
# qs: 2.2GB | |
# parquet: 1.4GB
# Write benchmark: time each serialiser writing the full table `y` into the
# working directory, five repetitions per expression.
#   dt1 / dtx : data.table::fwrite with 1 thread vs `nth` threads
#   rdr       : readr::write_csv
#   rds / rnc : saveRDS with default compression vs compress = FALSE
#   fst       : fst::write_fst
#   qs1 / qsx : qs::qsave with default threads vs nthreads = nth
#   prq       : arrow::write_parquet
microbenchmark::microbenchmark(
  'dt1' = {
    data.table::setDTthreads(1)
    data.table::fwrite(y, '2011.csv')
  },
  'dtx' = {
    data.table::setDTthreads(nth)
    data.table::fwrite(y, '2011.csv')
  },
  'rdr' = readr::write_csv(y, '2011.csv'),
  'rds' = saveRDS(y, '2011.rds'),
  'rnc' = saveRDS(y, '2011.rnc', compress = FALSE),
  'fst' = fst::write_fst(y, '2011.fst'),
  'qs1' = qs::qsave(y, '2011.qs'),
  'qsx' = qs::qsave(y, '2011.qs', nthreads = nth),
  # The parquet file must be named '2011.prq': that is the name used by the
  # read benchmark, the dataset query and the cleanup step in this script.
  'prq' = arrow::write_parquet(y, '2011.prq'),
  times = 5
)
# Unit: seconds | |
# expr min lq mean median uq max neval cld | |
# dt1 43.506698 43.601451 60.065286 59.142174 76.529121 78.470098 4 e | |
# dtx 3.885360 5.071990 5.783191 6.360932 6.494393 6.525542 4 ab | |
# rdr 20.462667 20.822410 21.702479 21.876254 22.582548 22.594739 4 cd | |
# rds 310.219050 314.627363 317.984075 319.508269 321.340787 322.700713 4 f | |
# rnc 11.780769 13.585253 14.649857 15.530841 15.714461 15.756976 4 a c | |
# fst 1.730701 2.188686 2.454120 2.676605 2.719554 2.732569 4 a | |
# qs1 35.756636 36.093977 36.608101 36.665860 37.122225 37.344047 4 d | |
# qsx 10.479202 10.589914 10.835403 10.874815 11.080891 11.112778 4 a c | |
# prq 18.032731 18.181001 18.515467 18.446190 18.849933 19.136758 4 bc | |
# Read benchmark: load each serialised 2011 file from the working directory
# back into memory, ten repetitions per reader. The data.table entries set
# the thread count before reading; the qs entries vary `nthreads`.
microbenchmark::microbenchmark(
  dt1 = {
    data.table::setDTthreads(1)
    data.table::fread("2011.csv")
  },
  dtx = {
    data.table::setDTthreads(nth)
    data.table::fread("2011.csv")
  },
  rdr = readr::read_csv("2011.csv"),
  rds = readRDS("2011.rds"),
  rnc = readRDS("2011.rnc"),
  fst = fst::read_fst("2011.fst", as.data.table = TRUE),
  qs1 = qs::qread("2011.qs"),
  qs6 = qs::qread("2011.qs", nthreads = 6),
  qsx = qs::qread("2011.qs", nthreads = nth),
  prq = arrow::read_parquet("2011.prq"),
  times = 10
)
# Unit: milliseconds | |
# expr min lq mean median uq max neval cld | |
# dt1 15150.102 15384.9305 15805.230 15822.712 15950.497 16602.819 10 g | |
# dtx 1625.603 1804.4039 2053.498 2176.882 2234.223 2355.109 10 b | |
# rdr 23429.444 23672.0207 23752.863 23718.268 23903.303 24048.548 10 h | |
# rds 26042.728 27223.5948 27243.311 27459.466 27535.535 27862.663 10 i | |
# rnc 9994.286 10206.1144 10534.082 10593.441 10886.925 11118.656 10 f | |
# fst 892.296 905.2518 1144.050 1089.277 1282.975 1609.692 10 a | |
# qs1 9140.779 9514.6906 9603.860 9570.558 9667.102 10156.028 10 e | |
# qs6 5696.388 5773.2970 5906.207 5949.036 5953.662 6108.717 10 c | |
# qsx 7913.974 8072.5697 8346.366 8401.842 8429.219 8910.961 10 d | |
# Store `y` through Rfuns::write_fst_idx under the name '2011', passing
# pu_month and pu_day as `cname` (presumably the columns the index is built
# on -- confirm against the Rfuns documentation).
Rfuns::write_fst_idx('2011', cname = c('pu_month', 'pu_day'), dts = y)

# Subset benchmark: three ways to obtain the rows with pu_month == 8 and
# pu_day between 1 and 10.
#   idx : one Rfuns::read_fst_idx call per day, results row-bound together
#   nox : read the whole fst file, then filter in memory
#   prq : filter the parquet dataset through arrow/dplyr before collecting
microbenchmark::microbenchmark(
  'idx' = data.table::rbindlist(
    lapply(1:10, \(x) Rfuns::read_fst_idx('2011', c(8, x)))
  ),
  # Subset the data.table directly rather than piping through `%>%`:
  # magrittr is never attached in this script, so the pipe operator would
  # not be found in a clean session.
  'nox' = fst::read_fst('2011.fst', as.data.table = TRUE)[
    pu_month == 8 & pu_day <= 10
  ],
  'prq' = arrow::open_dataset('2011.prq') |>
    dplyr::filter(pu_month == 8 & pu_day <= 10) |>
    dplyr::collect(),
  times = 10
)
# Unit: milliseconds | |
# expr min lq mean median uq max neval cld | |
# idx 144.7218 145.1878 156.8216 153.5852 164.5233 186.6884 10 a | |
# nox 1805.2652 1890.6643 2122.4655 2077.5755 2357.2330 2559.6328 10 b | |
# prq 239.7660 242.6931 248.3015 247.7814 249.1476 261.9594 10 a | |
# Cleanup: delete the benchmark artefacts from the working directory.
# file.remove() is vectorised, so a single call handles every path; paths
# that do not exist are filtered out first to avoid a warning per missing
# file.
targets <- c(
  '2011',
  paste0('2011.', c('idx', 'csv', 'fst', 'qs', 'prq', 'rds', 'rnc'))
)
file.remove(targets[file.exists(targets)])

# Drop every object in the workspace (including the large table `y`) and
# trigger a garbage collection.
rm(list = ls())
gc()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment