Created
November 8, 2022 20:41
-
-
Save lvalnegri/ac1904a93f79278b92810161886182fd to your computer and use it in GitHub Desktop.
For #rstats users interested in working with the #geoparquet format, I've run a small benchmark comparing various storage formats and ways of querying data, as well as different hardware. The reference file is an #sf object, the complete Italian Census 2011 small areas, for a total of ~400K polygons with only an id column attached to the geom,…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setup ----
# Packages used below (all called with explicit `pkg::` prefixes):
#   geoarrow: https://github.com/paleolimbot/geoarrow,
#             remotes::install_github("paleolimbot/geoarrow")
#   sfarrow:  https://github.com/wcjochem/sfarrow, install.packages("sfarrow")
#   qs:       https://github.com/traversc/qs, install.packages("qs")

# All file paths in this script are relative ("./..."), so the working
# directory must point at the benchmark folder before anything is read.
setwd("/home/datamaps/temp/Rgeobench/")

# Reference object: Italian Census Tracts, see https://bit.ly/szn_cens
y <- readRDS("./0.rds")  # file on disk: 333.3MB
object.size(y)           # 798,256,856 bytes in memory

# Write the same object once per storage format under comparison.
# The trailing comment on each line is the resulting file size on disk.
saveRDS(y, "./0.rnc", compress = FALSE)   # uncompressed RDS: 541.2MB
sfarrow::st_write_parquet(y, "./0.sfa")   # parquet via sfarrow: 471.8MB
geoarrow::write_geoparquet(y, "./0.gar")  # geoparquet via geoarrow: 471.8MB
qs::qsave(y, "./0.qs", nthreads = 6)      # qs, written with 6 threads: 325.4MB
# Benchmark 1: read the full object back from each storage format ----
# Every expression loads the complete file written in the setup step;
# each one is evaluated 10 times.
microbenchmark::microbenchmark(
  rds = readRDS("./0.rds"),                        # compressed RDS
  sfa = sfarrow::st_read_parquet("./0.sfa"),       # parquet via sfarrow
  gar = geoarrow::read_geoparquet_sf("./0.gar"),   # geoparquet via geoarrow
  qs2 = qs::qread("./0.qs", nthreads = 2),         # qs, 2 threads
  qs1 = qs::qread("./0.qs"),                       # qs, default nthreads
  rnc = readRDS("./0.rnc"),                        # uncompressed RDS
  times = 10
)
# Recorded results, one table per machine.
# NOTE: the Desktop table has no `gar` / `qs2` rows and no `cld` column, and
# the VPS table has no `qs2` row; the recorded output is reproduced unchanged.
#
# Unit: milliseconds [VDS Ryzen 9 5950X (32 physical threads)]
# expr min lq mean median uq max neval cld
# rds 2467.8933 2536.6780 2925.6897 2815.8789 3499.5137 3635.975 10 c
# sfa 7784.5293 7957.8702 8183.3330 8117.0573 8268.3980 9038.568 10 d
# gar 1646.3129 1699.0325 1803.7138 1750.3947 1867.7311 2194.139 10 b
# qs2 288.0859 401.8683 699.3778 625.6076 976.0137 1339.331 10 a
# qs1 426.8980 445.4319 745.4705 592.3386 1091.7139 1579.482 10 a
# rnc 693.2335 711.4279 817.4475 798.1594 897.0033 1016.470 10 a
#
# Unit: milliseconds [Desktop Ryzen 7 3700X (16 physical threads)]
# expr min lq mean median uq max neval
# rds 3628.2262 3692.5467 3890.180 3763.6072 3863.867 4506.057 10
# sfa 12342.1899 12490.4987 12646.450 12672.6011 12826.803 12949.906 10
# qs1 628.5944 799.2701 1094.869 927.5784 1671.162 1725.050 10
# rnc 1313.6518 1328.4417 1553.817 1459.3336 1669.474 2483.823 10
#
# Unit: milliseconds [VPS Xeon E3-1290 (10 virtual threads)]
# expr min lq mean median uq max neval cld
# rds 5391.4051 5420.054 5637.809 5532.758 5721.448 6197.399 10 c
# sfa 19905.4693 20454.637 21498.859 21518.111 22047.632 24406.320 10 d
# gar 3220.9154 3379.480 4316.381 3547.836 5459.431 6173.623 10 b
# qs1 864.9457 1013.159 1304.026 1239.925 1612.909 1721.541 10 a
# rnc 1780.7524 1826.183 2104.321 2047.655 2164.540 2727.607 10 a
# Benchmark 2: extract a subset of polygons by id ----
# `yz` holds the SZN ids to keep: rows of the `sezioni` lookup table whose
# CMN equals 15146, i.e. the 6085 census tracts in Milan (code 15146).
yz <- fst::read_fst("./sezioni", as.data.table = TRUE)[CMN == 15146, SZN]

# Compare four ways of obtaining the rows of the reference data in `yz`:
#   gpq - filter the geoparquet file through arrow, then collect as sf
#   sfs - read the whole qs file, then base `subset()`
#   sfd - read the whole qs file, then `dplyr::filter()`
#   sf  - `subset()` on `y`, the object already loaded in memory (no file read)
microbenchmark::microbenchmark(
  gpq = arrow::open_dataset("./0.gar") |>
    dplyr::filter(SZN %in% yz) |>
    geoarrow::geoarrow_collect_sf(),
  sfs = qs::qread("./0.qs", nthreads = 2) |>
    subset(SZN %in% yz),
  sfd = qs::qread("./0.qs", nthreads = 2) |>
    dplyr::filter(SZN %in% yz),
  sf = y |>
    subset(SZN %in% yz),
  times = 10
)
# Recorded results, one table per machine.
#
# Unit: milliseconds [VDS Ryzen 9 5950X (32 physical threads)]
# expr min lq mean median uq max neval cld
# gpq 285.0784 497.1767 531.8277 552.0051 567.7457 695.2356 10 b
# sfs 500.8068 502.8678 728.9501 615.3639 944.9022 1438.1080 10 b
# sfd 497.3435 531.6648 812.5998 732.6829 864.0335 1445.1170 10 b
# sf 213.8443 216.8596 218.8881 218.7319 221.5313 222.2413 10 a
#
# Unit: milliseconds [VPS Xeon E3-1290 (10 virtual threads)]
# expr min lq mean median uq max neval cld
# gpq 1144.1041 1214.0428 1295.2017 1292.0714 1339.7389 1522.627 10 a
# sfs 1585.7888 1620.3887 2486.9763 2460.6871 2826.7120 4238.592 10 b
# sfd 1526.0645 1711.5308 2147.7086 1821.9781 2051.1485 4248.069 10 b
# sf 632.9385 651.5985 688.5178 681.0304 710.5937 812.867 10 a
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment