# jimhester/gun-violence.R

## gun-violence.R
library(dtplyr)
library(readr)
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(data.table))

# It is generally better not to benchmark the print methods, to avoid
# misleading results. Also, vroom is faster on this particular dataset than
# either readr or data.table.

# Benchmark the full pipeline -- reading the CSV from disk and then counting
# rows per state -- for three readers: readr, data.table's fread (counted
# through dtplyr), and vroom.
#
# The dataset path is defined once so that all three expressions read the same
# file and the location only has to be changed in one place.
gun_violence_csv <- "~/Downloads/gun-violence-data_01-2013_03-2018.csv"

bench::mark(
  readr = readr::read_csv(
    gun_violence_csv,
    progress = FALSE,
    col_types = list()
  ) %>%
    group_by(state) %>%
    count(sort = TRUE),

  # lazy_dt() wraps the data.table for dtplyr; as.data.table() collects the
  # lazy query so that the count itself is part of the timing.
  fread = fread(gun_violence_csv) %>%
    lazy_dt() %>%
    group_by(state) %>%
    count(sort = TRUE) %>%
    as.data.table(),

  vroom = vroom::vroom(
    gun_violence_csv,
    progress = FALSE,
    col_types = list()
  ) %>%
    group_by(state) %>%
    count(sort = TRUE),

  # Every iteration re-reads the CSV; run each expression 5 times.
  iterations = 5
)
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 3 x 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 readr         2.24s    2.57s     0.398   155.8MB     1.11
#> 2 fread      716.57ms 737.69ms     1.03    121.6MB     1.03
#> 3 vroom      229.19ms 238.72ms     3.10     24.4MB     1.24

# lazy_dt called on a tibble or data.frame will convert it to a data.table,
# which takes considerable time. To avoid including this conversion in the
# results of the group-wise counts, call lazy_dt before the benchmark.

# Load the data once, outside the benchmark, so that only the group-wise count
# is timed afterwards. The path is defined once so both readers are guaranteed
# to load the same file.
gun_violence_csv <- "~/Downloads/gun-violence-data_01-2013_03-2018.csv"

# Plain tibble as returned by readr.
dat_readr <- readr::read_csv(
  gun_violence_csv,
  progress = FALSE,
  col_types = list()
)

# The same readr data wrapped for dtplyr; the conversion to a data.table
# happens here rather than inside a timed expression.
dat_readr_dt <- lazy_dt(dat_readr)

# Data read by data.table's fread, wrapped for dtplyr.
dat_dt <- lazy_dt(fread(gun_violence_csv))

# Time only the group-wise count on the pre-loaded data. The two dtplyr
# variants end in as_tibble() so that the lazy query is collected inside the
# timed expression.
bench::mark(
  readr = dat_readr %>%
    group_by(state) %>%
    count(sort = TRUE),

  "readr-dtplyr" = dat_readr_dt %>%
    group_by(state) %>%
    count(sort = TRUE) %>%
    as_tibble(),

  "fread-dtplyr" = dat_dt %>%
    group_by(state) %>%
    count(sort = TRUE) %>%
    as_tibble()
)
#> # A tibble: 3 x 6
#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 readr         13.61ms  14.14ms      69.9    1.84MB     0
#> 2 readr-dtplyr   4.83ms   5.56ms     177.     1.08MB     2.73
#> 3 fread-dtplyr    4.9ms   5.45ms     180.     1.03MB     2.91

#> Created on 2019-11-14 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)
	library(dtplyr)
	library(readr)
	suppressPackageStartupMessages(library(dplyr))
	suppressPackageStartupMessages(library(data.table))

	# It is generally better not to benchmark the print methods, to avoid
	# misleading results. Also, vroom is faster on this particular dataset than
	# either readr or data.table.

	# Benchmark the full pipeline -- reading the CSV from disk and then counting
	# rows per state -- for three readers: readr, data.table's fread (counted
	# through dtplyr), and vroom.
	#
	# The dataset path is defined once so that all three expressions read the
	# same file and the location only has to be changed in one place.
	gun_violence_csv <- "~/Downloads/gun-violence-data_01-2013_03-2018.csv"

	bench::mark(
	  readr = readr::read_csv(
	    gun_violence_csv,
	    progress = FALSE,
	    col_types = list()
	  ) %>%
	    group_by(state) %>%
	    count(sort = TRUE),

	  # lazy_dt() wraps the data.table for dtplyr; as.data.table() collects the
	  # lazy query so that the count itself is part of the timing.
	  fread = fread(gun_violence_csv) %>%
	    lazy_dt() %>%
	    group_by(state) %>%
	    count(sort = TRUE) %>%
	    as.data.table(),

	  vroom = vroom::vroom(
	    gun_violence_csv,
	    progress = FALSE,
	    col_types = list()
	  ) %>%
	    group_by(state) %>%
	    count(sort = TRUE),

	  # Every iteration re-reads the CSV; run each expression 5 times.
	  iterations = 5
	)
	#> Warning: Some expressions had a GC in every iteration; so filtering is
	#> disabled.
	#> # A tibble: 3 x 6
	#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
	#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
	#> 1 readr         2.24s    2.57s     0.398   155.8MB     1.11
	#> 2 fread      716.57ms 737.69ms     1.03    121.6MB     1.03
	#> 3 vroom      229.19ms 238.72ms     3.10     24.4MB     1.24

	# lazy_dt called on a tibble or data.frame will convert it to a data.table,
	# which takes considerable time. To avoid including this conversion in the
	# results of the group-wise counts, call lazy_dt before the benchmark.

	# Load the data once, outside the benchmark, so that only the group-wise
	# count is timed afterwards. The path is defined once so both readers are
	# guaranteed to load the same file.
	gun_violence_csv <- "~/Downloads/gun-violence-data_01-2013_03-2018.csv"

	# Plain tibble as returned by readr.
	dat_readr <- readr::read_csv(
	  gun_violence_csv,
	  progress = FALSE,
	  col_types = list()
	)

	# The same readr data wrapped for dtplyr; the conversion to a data.table
	# happens here rather than inside a timed expression.
	dat_readr_dt <- lazy_dt(dat_readr)

	# Data read by data.table's fread, wrapped for dtplyr.
	dat_dt <- lazy_dt(fread(gun_violence_csv))

	# Time only the group-wise count on the pre-loaded data. The two dtplyr
	# variants end in as_tibble() so that the lazy query is collected inside the
	# timed expression.
	bench::mark(
	  readr = dat_readr %>%
	    group_by(state) %>%
	    count(sort = TRUE),

	  "readr-dtplyr" = dat_readr_dt %>%
	    group_by(state) %>%
	    count(sort = TRUE) %>%
	    as_tibble(),

	  "fread-dtplyr" = dat_dt %>%
	    group_by(state) %>%
	    count(sort = TRUE) %>%
	    as_tibble()
	)
	#> # A tibble: 3 x 6
	#>   expression        min   median `itr/sec` mem_alloc `gc/sec`
	#>   <bch:expr>   <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
	#> 1 readr         13.61ms  14.14ms      69.9    1.84MB     0
	#> 2 readr-dtplyr   4.83ms   5.56ms     177.     1.08MB     2.73
	#> 3 fread-dtplyr    4.9ms   5.45ms     180.     1.03MB     2.91

	#> Created on 2019-11-14 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)