library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df = data.frame(id_col = sample(paste0(letters, seq(1, 2000000)), 1e6, replace = TRUE), value = rnorm(1e6))
microbenchmark::microbenchmark(
df_dup1 = df %>%
group_by(id_col) %>%
filter(n()>1),
df_dup2 = {
dup_entries = df$id_col[duplicated(df$id_col)]
df[df$id_col %in% dup_entries, ]
},
times = 5
)
#> Unit: milliseconds
#> expr min lq mean median uq max
#> df_dup1 19196.0949 19290.6953 19601.2547 19454.7607 20028.4515 20036.2711
#> df_dup2 250.7179 270.2355 459.7812 275.1833 708.6873 794.0822
#> neval
#> 5
#> 5
Created on 2020-11-16 by the reprex package (v0.3.0)
Session info
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.0.0 (2020-04-24)
#> os Pop!_OS 20.04 LTS
#> system x86_64, linux-gnu
#> ui X11
#> language en_US:en
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz America/New_York
#> date 2020-11-16
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.0)
#> backports 1.2.0 2020-11-02 [1] CRAN (R 4.0.0)
#> callr 3.5.1 2020-10-13 [1] CRAN (R 4.0.0)
#> cli 2.1.0 2020-10-12 [1] CRAN (R 4.0.0)
#> crayon 1.3.4 2017-09-16 [1] CRAN (R 4.0.0)
#> desc 1.2.0 2018-05-01 [1] CRAN (R 4.0.0)
#> devtools 2.3.2 2020-09-18 [1] CRAN (R 4.0.0)
#> digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.0)
#> dplyr * 1.0.2 2020-08-18 [1] CRAN (R 4.0.0)
#> ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.0)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.0)
#> fansi 0.4.1 2020-01-08 [1] CRAN (R 4.0.0)
#> fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.0)
#> generics 0.1.0 2020-10-31 [1] CRAN (R 4.0.0)
#> glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.0)
#> highr 0.8 2019-03-20 [1] CRAN (R 4.0.0)
#> htmltools 0.5.0 2020-06-16 [1] CRAN (R 4.0.0)
#> knitr 1.30 2020-09-22 [1] CRAN (R 4.0.0)
#> lifecycle 0.2.0 2020-03-06 [1] CRAN (R 4.0.0)
#> magrittr 1.5 2014-11-22 [1] CRAN (R 4.0.0)
#> memoise 1.1.0 2017-04-21 [1] CRAN (R 4.0.0)
#> microbenchmark 1.4-7 2019-09-24 [1] CRAN (R 4.0.0)
#> pillar 1.4.6 2020-07-10 [1] CRAN (R 4.0.0)
#> pkgbuild 1.1.0 2020-07-13 [1] CRAN (R 4.0.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.0)
#> pkgload 1.1.0 2020-05-29 [1] CRAN (R 4.0.0)
#> prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.0)
#> processx 3.4.4 2020-09-03 [1] CRAN (R 4.0.0)
#> ps 1.4.0 2020-10-07 [1] CRAN (R 4.0.0)
#> purrr 0.3.4 2020-04-17 [1] CRAN (R 4.0.0)
#> R6 2.5.0 2020-10-28 [1] CRAN (R 4.0.0)
#> remotes 2.2.0 2020-07-21 [1] CRAN (R 4.0.0)
#> rlang 0.4.8 2020-10-08 [1] CRAN (R 4.0.0)
#> rmarkdown 2.5 2020-10-21 [1] CRAN (R 4.0.0)
#> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 4.0.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.0)
#> stringi 1.5.3 2020-09-09 [1] CRAN (R 4.0.0)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 4.0.0)
#> testthat 3.0.0 2020-10-31 [1] CRAN (R 4.0.0)
#> tibble 3.0.4 2020-10-12 [1] CRAN (R 4.0.0)
#> tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.0)
#> usethis 1.6.3 2020-09-17 [1] CRAN (R 4.0.0)
#> vctrs 0.3.4 2020-08-29 [1] CRAN (R 4.0.0)
#> withr 2.3.0 2020-09-22 [1] CRAN (R 4.0.0)
#> xfun 0.19 2020-10-30 [1] CRAN (R 4.0.0)
#> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.0)
#>
#> [1] /software/R_libs/R400
#> [2] /software/R-4.0.0/lib/R/library