Skip to content

Instantly share code, notes, and snippets.

@SigurdJanson
Created October 30, 2022 11:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save SigurdJanson/3e87c7cfda032d313ab04748556ac911 to your computer and use it in GitHub Desktop.
Save SigurdJanson/3e87c7cfda032d313ab04748556ac911 to your computer and use it in GitHub Desktop.
Performance benchmark of various solutions comparing two vectors
  • setequal outperforms other approaches when we can expected that vectors have the same length.
  • If vectors vary in length or can be NULL, using length and intersect (is_set_same6()) is superior to all others, even setequal.
# testing speed of finding identical vectors
library(compare)
#> 
#> Attache Paket: 'compare'
#> Das folgende Objekt ist maskiert 'package:base':
#> 
#>     isTRUE
library(bench)

is_set_same1 <- function (a, b) {
  l <-
    Map(table, list(a, b)) # Compute frequencies - returns ordered table
  Reduce(identical, l) # Check if frequencies are the same for all input vectors
}
is_set_same4 <- function(a, b) {
  compare(a, b, allowAll = TRUE, shorten = FALSE)$result
}
is_set_same5 <- function(a, b) {
  return(identical(sort(a), sort(b)))
}
is_set_same6 <- function(a, b) {
  length(a) == length(b) && length(a) == length(intersect(a, b))
}
is_set_same7 <- function(a, b) {
  length(a) == length(b) && setequal(a, b)
}

Case:Total overlap, different order

a <- 1:1000
b <- 1000:1
bench::mark(
  setequal(a, b),
  all(a %in% b) & all(b%in%a),
  `Map/Reduce` = is_set_same1(a, b),
  compare = is_set_same4(a, b),
  sort = is_set_same5(a, b),
  `length/intersect` = is_set_same6(a, b),
  `length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#>   expression                         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 setequal(a, b)                  17.2µs     22µs    29322.   46.54KB    17.6 
#> 2 all(a %in% b) & all(b %in% a)   20.1µs     24µs    29739.   47.72KB    23.8 
#> 3 Map/Reduce                     872.6µs  897.2µs     1065.  489.36KB     6.21
#> 4 compare                        342.6µs  353.6µs     2659.  532.31KB    14.6 
#> 5 sort                            41.6µs   43.5µs    21847.    7.91KB    17.5 
#> 6 length/intersect                23.9µs   26.6µs    29182.   51.54KB    26.3 
#> 7 length/set                      17.9µs   20.1µs    39108.   39.81KB    27.4

Case: No overlap

b <- 2000:1001
bench::mark(
  setequal(a, b),
  all(a %in% b) & all(b%in%a),
  `Map/Reduce` = is_set_same1(a, b),
  compare = is_set_same4(a, b),
  sort = is_set_same5(a, b),
  `length/intersect` = is_set_same6(a, b),
  `length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#>   expression                         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 setequal(a, b)                   8.5µs    9.2µs    87375.   19.91KB    35.0 
#> 2 all(a %in% b) & all(b %in% a)   17.6µs   21.1µs    36339.   47.72KB    32.7 
#> 3 Map/Reduce                      1.09ms    1.1ms      888.  332.19KB     4.07
#> 4 compare                        775.7µs  814.8µs     1163.  314.73KB    17.4 
#> 5 sort                            40.9µs   43.4µs    22249.    7.91KB    17.9 
#> 6 length/intersect                18.9µs   21.1µs    38712.   39.68KB    27.1 
#> 7 length/set                       9.2µs   10.4µs    72695.   19.91KB    29.1

Case: Overlap 50%

b <- c(sample(a, length(a)/2), sample(2000:1001, length(a)/2))
bench::mark(
  setequal(a, b),
  all(a %in% b) & all(b%in%a),
  `Map/Reduce` = is_set_same1(a, b),
  compare = is_set_same4(a, b),
  sort = is_set_same5(a, b),
  `length/intersect` = is_set_same6(a, b),
  `length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#>   expression                         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 setequal(a, b)                     9µs   10.8µs    74634.   19.91KB    29.9 
#> 2 all(a %in% b) & all(b %in% a)   19.1µs     23µs    32591.   47.72KB    29.4 
#> 3 Map/Reduce                      1.02ms   1.04ms      915.  340.23KB     4.07
#> 4 compare                        844.4µs  863.6µs     1139.  275.88KB    17.4 
#> 5 sort                            68.7µs   71.1µs    13773.    7.91KB    10.2 
#> 6 length/intersect                23.5µs   26.6µs    30429.   45.68KB    24.4 
#> 7 length/set                       9.8µs   11.8µs    70363.   19.91KB    28.2

Case: Contains NA

b[sample.int(1:length(b), size = 10)] <- NA
#> Error in sample.int(1:length(b), size = 10): length(n) == 1L ist nicht TRUE
bench::mark(
  setequal(a, b),
  all(a %in% b) & all(b%in%a),
  `Map/Reduce` = is_set_same1(a, b),
  compare = is_set_same4(a, b),
  sort = is_set_same5(a, b),
  `length/intersect` = is_set_same6(a, b),
  `length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#>   expression                         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 setequal(a, b)                  14.1µs   15.8µs    54374.   19.91KB    21.8 
#> 2 all(a %in% b) & all(b %in% a)     26µs   29.9µs    26928.   47.72KB    24.3 
#> 3 Map/Reduce                      1.03ms   1.04ms      936.  340.23KB     6.13
#> 4 compare                        839.2µs  858.9µs     1144.  279.83KB    14.7 
#> 5 sort                            67.5µs   71.1µs    13642.    7.91KB    12.4 
#> 6 length/intersect                  29µs   31.7µs    26518.   45.68KB    21.2 
#> 7 length/set                      14.9µs     16µs    55129.   19.91KB    22.1

Case: length difference by 1, same elements except first

b <- a[-1]
bench::mark(
  setequal(a, b),
  all(a %in% b) & all(b%in%a),
  `Map/Reduce` = is_set_same1(a, b),
  compare = is_set_same4(a, b),
  sort = is_set_same5(a, b),
  `length/intersect` = is_set_same6(a, b),
  `length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#>   expression                         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 setequal(a, b)                   9.1µs    9.8µs    85055.    19.9KB    34.0 
#> 2 all(a %in% b) & all(b %in% a)   19.4µs   21.6µs    35888.    47.7KB    32.3 
#> 3 Map/Reduce                     842.7µs  857.8µs     1112.   340.2KB     8.56
#> 4 compare                        503.2µs  516.1µs     1894.        0B    19.5 
#> 5 sort                            10.8µs   11.3µs    84267.        0B    16.9 
#> 6 length/intersect                 600ns    700ns  1351563.        0B   135.  
#> 7 length/set                       500ns    600ns  1561524.        0B     0

Case: One vector is NULL

b <- NULL
bench::mark(
  setequal(a, b),
  all(a %in% b) & all(b%in%a),
  `Map/Reduce` = is_set_same1(a, b),
  compare = is_set_same4(a, b),
  sort = is_set_same5(a, b),
  `length/intersect` = is_set_same6(a, b),
  `length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#>   expression                         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 setequal(a, b)                   1.9µs    2.1µs   425953.    3.95KB    42.6 
#> 2 all(a %in% b) & all(b %in% a)    3.1µs    3.6µs   170060.    7.91KB    17.0 
#> 3 Map/Reduce                     500.4µs  515.4µs     1865.  166.09KB     6.20
#> 4 compare                        589.5µs  604.5µs     1634.    4.21KB    21.2 
#> 5 sort                            20.9µs   21.8µs    44736.        0B    22.4 
#> 6 length/intersect                 600ns    700ns  1327545.        0B     0   
#> 7 length/set                       500ns    600ns  1565876.        0B     0

Created on 2022-10-30 with reprex v2.0.2

See also "Check whether two vectors contain the same (unordered) elements in R" on StackOverflow

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment