setequal
outperforms other approaches when we can expected that vectors have the same length.- If vectors vary in length or can be NULL, using
length
andintersect
(is_set_same6()
) is superior to all others, evensetequal
.
# testing speed of finding identical vectors
library(compare)
#>
#> Attache Paket: 'compare'
#> Das folgende Objekt ist maskiert 'package:base':
#>
#> isTRUE
library(bench)
is_set_same1 <- function (a, b) {
l <-
Map(table, list(a, b)) # Compute frequencies - returns ordered table
Reduce(identical, l) # Check if frequencies are the same for all input vectors
}
is_set_same4 <- function(a, b) {
compare(a, b, allowAll = TRUE, shorten = FALSE)$result
}
is_set_same5 <- function(a, b) {
return(identical(sort(a), sort(b)))
}
is_set_same6 <- function(a, b) {
length(a) == length(b) && length(a) == length(intersect(a, b))
}
is_set_same7 <- function(a, b) {
length(a) == length(b) && setequal(a, b)
}
Case:Total overlap, different order
a <- 1:1000
b <- 1000:1
bench::mark(
setequal(a, b),
all(a %in% b) & all(b%in%a),
`Map/Reduce` = is_set_same1(a, b),
compare = is_set_same4(a, b),
sort = is_set_same5(a, b),
`length/intersect` = is_set_same6(a, b),
`length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 setequal(a, b) 17.2µs 22µs 29322. 46.54KB 17.6
#> 2 all(a %in% b) & all(b %in% a) 20.1µs 24µs 29739. 47.72KB 23.8
#> 3 Map/Reduce 872.6µs 897.2µs 1065. 489.36KB 6.21
#> 4 compare 342.6µs 353.6µs 2659. 532.31KB 14.6
#> 5 sort 41.6µs 43.5µs 21847. 7.91KB 17.5
#> 6 length/intersect 23.9µs 26.6µs 29182. 51.54KB 26.3
#> 7 length/set 17.9µs 20.1µs 39108. 39.81KB 27.4
Case: No overlap
b <- 2000:1001
bench::mark(
setequal(a, b),
all(a %in% b) & all(b%in%a),
`Map/Reduce` = is_set_same1(a, b),
compare = is_set_same4(a, b),
sort = is_set_same5(a, b),
`length/intersect` = is_set_same6(a, b),
`length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 setequal(a, b) 8.5µs 9.2µs 87375. 19.91KB 35.0
#> 2 all(a %in% b) & all(b %in% a) 17.6µs 21.1µs 36339. 47.72KB 32.7
#> 3 Map/Reduce 1.09ms 1.1ms 888. 332.19KB 4.07
#> 4 compare 775.7µs 814.8µs 1163. 314.73KB 17.4
#> 5 sort 40.9µs 43.4µs 22249. 7.91KB 17.9
#> 6 length/intersect 18.9µs 21.1µs 38712. 39.68KB 27.1
#> 7 length/set 9.2µs 10.4µs 72695. 19.91KB 29.1
Case: Overlap 50%
b <- c(sample(a, length(a)/2), sample(2000:1001, length(a)/2))
bench::mark(
setequal(a, b),
all(a %in% b) & all(b%in%a),
`Map/Reduce` = is_set_same1(a, b),
compare = is_set_same4(a, b),
sort = is_set_same5(a, b),
`length/intersect` = is_set_same6(a, b),
`length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 setequal(a, b) 9µs 10.8µs 74634. 19.91KB 29.9
#> 2 all(a %in% b) & all(b %in% a) 19.1µs 23µs 32591. 47.72KB 29.4
#> 3 Map/Reduce 1.02ms 1.04ms 915. 340.23KB 4.07
#> 4 compare 844.4µs 863.6µs 1139. 275.88KB 17.4
#> 5 sort 68.7µs 71.1µs 13773. 7.91KB 10.2
#> 6 length/intersect 23.5µs 26.6µs 30429. 45.68KB 24.4
#> 7 length/set 9.8µs 11.8µs 70363. 19.91KB 28.2
Case: Contains NA
b[sample.int(1:length(b), size = 10)] <- NA
#> Error in sample.int(1:length(b), size = 10): length(n) == 1L ist nicht TRUE
bench::mark(
setequal(a, b),
all(a %in% b) & all(b%in%a),
`Map/Reduce` = is_set_same1(a, b),
compare = is_set_same4(a, b),
sort = is_set_same5(a, b),
`length/intersect` = is_set_same6(a, b),
`length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 setequal(a, b) 14.1µs 15.8µs 54374. 19.91KB 21.8
#> 2 all(a %in% b) & all(b %in% a) 26µs 29.9µs 26928. 47.72KB 24.3
#> 3 Map/Reduce 1.03ms 1.04ms 936. 340.23KB 6.13
#> 4 compare 839.2µs 858.9µs 1144. 279.83KB 14.7
#> 5 sort 67.5µs 71.1µs 13642. 7.91KB 12.4
#> 6 length/intersect 29µs 31.7µs 26518. 45.68KB 21.2
#> 7 length/set 14.9µs 16µs 55129. 19.91KB 22.1
Case: length difference by 1, same elements except first
b <- a[-1]
bench::mark(
setequal(a, b),
all(a %in% b) & all(b%in%a),
`Map/Reduce` = is_set_same1(a, b),
compare = is_set_same4(a, b),
sort = is_set_same5(a, b),
`length/intersect` = is_set_same6(a, b),
`length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 setequal(a, b) 9.1µs 9.8µs 85055. 19.9KB 34.0
#> 2 all(a %in% b) & all(b %in% a) 19.4µs 21.6µs 35888. 47.7KB 32.3
#> 3 Map/Reduce 842.7µs 857.8µs 1112. 340.2KB 8.56
#> 4 compare 503.2µs 516.1µs 1894. 0B 19.5
#> 5 sort 10.8µs 11.3µs 84267. 0B 16.9
#> 6 length/intersect 600ns 700ns 1351563. 0B 135.
#> 7 length/set 500ns 600ns 1561524. 0B 0
Case: One vector is NULL
b <- NULL
bench::mark(
setequal(a, b),
all(a %in% b) & all(b%in%a),
`Map/Reduce` = is_set_same1(a, b),
compare = is_set_same4(a, b),
sort = is_set_same5(a, b),
`length/intersect` = is_set_same6(a, b),
`length/set` = is_set_same7(a, b)
)
#> # A tibble: 7 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 setequal(a, b) 1.9µs 2.1µs 425953. 3.95KB 42.6
#> 2 all(a %in% b) & all(b %in% a) 3.1µs 3.6µs 170060. 7.91KB 17.0
#> 3 Map/Reduce 500.4µs 515.4µs 1865. 166.09KB 6.20
#> 4 compare 589.5µs 604.5µs 1634. 4.21KB 21.2
#> 5 sort 20.9µs 21.8µs 44736. 0B 22.4
#> 6 length/intersect 600ns 700ns 1327545. 0B 0
#> 7 length/set 500ns 600ns 1565876. 0B 0
Created on 2022-10-30 with reprex v2.0.2
See also "Check whether two vectors contain the same (unordered) elements in R" on StackOverflow