public
Last active

Is Set faster than Array? Not neccesarily, depending on size of collection and number of reads vs writes. (MRI) (anything you notice invalid or could be done better about the way these benchmarks are done? Let us know, and give us code and results for the better way :) )

  • Download Gist
gistfile1.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
# test_for == N / 5 -- approximate array average case?
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>5, :num_rewrites=>1, :col_size=>10, :num_reads=>10}
0.140000 0.000000 0.140000 ( 0.141915)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>5, :num_rewrites=>1, :col_size=>10, :num_reads=>10}
0.190000 0.010000 0.200000 ( 0.198503)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>50, :num_rewrites=>1, :col_size=>100, :num_reads=>10}
0.890000 0.000000 0.890000 ( 0.899198)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>50, :num_rewrites=>1, :col_size=>100, :num_reads=>10}
1.020000 0.010000 1.030000 ( 1.037214)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>50, :num_rewrites=>1, :col_size=>100, :num_reads=>100}
5.020000 0.020000 5.040000 ( 5.060093)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>50, :num_rewrites=>1, :col_size=>100, :num_reads=>100}
1.560000 0.010000 1.570000 ( 1.570679)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>5, :num_rewrites=>1, :col_size=>10, :num_reads=>100}
0.780000 0.000000 0.780000 ( 0.783559)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>5, :num_rewrites=>1, :col_size=>10, :num_reads=>100}
0.720000 0.000000 0.720000 ( 0.737738)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>5, :num_rewrites=>1, :col_size=>10, :num_reads=>500}
3.670000 0.020000 3.690000 ( 3.714378)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>5, :num_rewrites=>1, :col_size=>10, :num_reads=>500}
3.010000 0.020000 3.030000 ( 3.040910)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>500, :num_rewrites=>1, :col_size=>1000, :num_reads=>10}
8.250000 0.040000 8.290000 ( 8.334810)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>500, :num_rewrites=>1, :col_size=>1000, :num_reads=>10}
9.780000 0.030000 9.810000 ( 9.849330)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>5000, :num_rewrites=>1, :col_size=>10000, :num_reads=>10}
90.600000 2.010000 92.610000 ( 95.435389)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>5000, :num_rewrites=>1, :col_size=>10000, :num_reads=>10}
116.800000 0.870000 117.670000 (122.155812)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>50, :num_rewrites=>5, :col_size=>100, :num_reads=>100}
8.660000 0.060000 8.720000 ( 8.975762)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>50, :num_rewrites=>10, :col_size=>100, :num_reads=>100}
14.110000 0.120000 14.230000 ( 14.630167)
gistfile2.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
## test_for == N + 10 , array worst case, when the test operand isn't in the collection at all?
 
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>20, :num_rewrites=>1, :col_size=>10, :num_reads=>10}
0.180000 0.000000 0.180000 ( 0.178402)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>20, :num_rewrites=>1, :col_size=>10, :num_reads=>10}
0.190000 0.000000 0.190000 ( 0.196501)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>110, :num_rewrites=>1, :col_size=>100, :num_reads=>10}
1.310000 0.010000 1.320000 ( 1.325232)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>110, :num_rewrites=>1, :col_size=>100, :num_reads=>10}
1.020000 0.010000 1.030000 ( 1.034591)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>110, :num_rewrites=>1, :col_size=>100, :num_reads=>100}
9.180000 0.040000 9.220000 ( 9.261914)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>110, :num_rewrites=>1, :col_size=>100, :num_reads=>100}
1.560000 0.010000 1.570000 ( 1.576411)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>20, :num_rewrites=>1, :col_size=>10, :num_reads=>100}
1.200000 0.000000 1.200000 ( 1.221110)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>20, :num_rewrites=>1, :col_size=>10, :num_reads=>100}
0.730000 0.010000 0.740000 ( 0.736425)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>20, :num_rewrites=>1, :col_size=>10, :num_reads=>500}
5.730000 0.020000 5.750000 ( 5.765911)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>20, :num_rewrites=>1, :col_size=>10, :num_reads=>500}
3.000000 0.010000 3.010000 ( 3.078013)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>1010, :num_rewrites=>1, :col_size=>1000, :num_reads=>10}
12.510000 0.060000 12.570000 ( 12.677678)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>1010, :num_rewrites=>1, :col_size=>1000, :num_reads=>10}
9.790000 0.040000 9.830000 ( 9.870308)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>10010, :num_rewrites=>1, :col_size=>10000, :num_reads=>10}
146.260000 2.430000 148.690000 (153.275032)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>10010, :num_rewrites=>1, :col_size=>10000, :num_reads=>10}
124.660000 1.160000 125.820000 (132.589335)
----
 
{:benchmark_iterations=>10000, :class=>Array, :test_for=>110, :num_rewrites=>5, :col_size=>100, :num_reads=>100}
15.170000 0.190000 15.360000 ( 17.500672)
 
{:benchmark_iterations=>10000, :class=>Set, :test_for=>110, :num_rewrites=>10, :col_size=>100, :num_reads=>100}
13.460000 0.070000 13.530000 ( 13.704671)
report.markdown
Markdown

update it was pointed out a major bug in this test is it was always testing for 56, and since we were generating our collection array in order 1..n, 56 will be near the beginning of the array for large n's, and thus give array a near 'best case' performance. So I've run it again, with the test_for variable being calculated to be the mid-point (close to average case for array?), and another time with the test_for being a number that is not in the collection at all (array worse case).

In the 'array average case' approximation, Array still does better than Set in some cases that might surprise you, but not as much better, and not in as many cases.

In the 'array worst case', Array doesn't really ever do better than Set, and sometimes does really bad.

See the actual output below, decide for yourself based on this or any other data (and please feel free to add a comment explaining why this data may not be useful for real-world usage prediction, either here or on the reddit thread. I was just trying to explore something I was skeptical of empirically, instead of just arguing about it without measuring it, trying to measure it. If I messed it up, feel free to say so and why.)


Answer: If you have both a non-trivial number of reads and a non-trivial collection size, Set is faster. But even a huge collection with just a few reads, Array is generally faster. And a 10 element collection needs more than 100 reads for Set to take the lead.

'reads' are check for include? According to my code below (am I in error somewhere?):

  • For a collection of size 10, with only 10 reads, Array is (slightly) faster.

  • For a collection of size 100 with only 10 reads, Array is still (slightly) faster.

  • For a collection size of 100, with 100 reads.... Set is 5x faster.

  • For a collection size of 10 with 100 reads... Array is still very slightly faster.

  • For a collection size of 10 with 500 reads.... Set is faster again, but not by a huge amount.

  • For a collection size of 1000 with 10 reads.... Array is still 2x faster!

  • For a collection size of 10000 with 10 reads... Array somehow about 3x faster?!

Okay, but what if we do lots of writes too?

  • For a collection size of 100, with 100 reads, but where we rewrite the contents 5 times (500 writes, 100 5x)... Set is still faster, but by less than 2x now.

  • For a collection size of 100 with 100 reads but where we rewrite the contents 10 times... Array is slightly faster again.

Note, ALL these 'faster than' measurements come from repeating the whole simulation 10k times, in actual real world app any degree of 'faster' either way may or may not be significant enough to matter. beware the micro-benchmark.

results.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
# update, this is close to 'best case' performance for the Array in many of the scenarios
 
----
 
{:test_for=>56, :class=>Array, :benchmark_iterations=>10000, :num_rewrites=>1, :col_size=>10, :num_reads=>10}
0.180000 0.000000 0.180000 ( 0.177469)
 
{:test_for=>56, :class=>Set, :benchmark_iterations=>10000, :num_rewrites=>1, :col_size=>10, :num_reads=>10}
0.190000 0.000000 0.190000 ( 0.195521)
----
 
{:test_for=>56, :class=>Array, :benchmark_iterations=>10000, :num_rewrites=>1, :col_size=>100, :num_reads=>10}
0.930000 0.010000 0.940000 ( 0.944080)
 
{:test_for=>56, :class=>Set, :benchmark_iterations=>10000, :num_rewrites=>1, :col_size=>100, :num_reads=>10}
1.020000 0.010000 1.030000 ( 1.034966)
----
 
{:test_for=>56, :class=>Array, :benchmark_iterations=>10000, :num_rewrites=>1, :col_size=>100, :num_reads=>100}
5.810000 0.030000 5.840000 ( 5.945467)
 
{:test_for=>56, :class=>Set, :benchmark_iterations=>10000, :num_rewrites=>1, :col_size=>100, :num_reads=>100}
1.780000 0.020000 1.800000 ( 1.906391)
----
 
{:test_for=>56, :class=>Array, :benchmark_iterations=>10000, :num_rewrites=>10, :col_size=>10, :num_reads=>100}
1.620000 0.010000 1.630000 ( 1.637445)
 
{:test_for=>56, :class=>Set, :benchmark_iterations=>10000, :num_rewrites=>10, :col_size=>10, :num_reads=>100}
1.730000 0.010000 1.740000 ( 1.747314)
----
 
{:test_for=>56, :class=>Array, :benchmark_iterations=>10000, :num_rewrites=>10, :col_size=>10, :num_reads=>500}
6.180000 0.050000 6.230000 ( 6.349723)
 
{:test_for=>56, :class=>Set, :benchmark_iterations=>10000, :num_rewrites=>10, :col_size=>10, :num_reads=>500}
4.050000 0.040000 4.090000 ( 4.200705)
---
 
{:test_for=>56, :class=>Array, :benchmark_iterations=>10000, :num_rewrites=>1, :col_size=>1000, :num_reads=>10}
4.950000 0.040000 4.990000 ( 5.204897)
 
{:test_for=>56, :class=>Set, :benchmark_iterations=>10000, :num_rewrites=>1, :col_size=>1000, :num_reads=>10}
10.340000 0.060000 10.400000 ( 10.542767)
----
 
{:test_for=>56, :class=>Array, :benchmark_iterations=>10000, :num_rewrites=>1, :col_size=>10000, :num_reads=>10}
40.520000 1.620000 42.140000 ( 45.072684)
 
{:test_for=>56, :class=>Set, :benchmark_iterations=>10000, :num_rewrites=>1, :col_size=>10000, :num_reads=>10}
120.250000 0.820000 121.070000 (124.624476)
 
----
 
{:test_for=>56, :class=>Array, :benchmark_iterations=>10000, :num_rewrites=>5, :col_size=>100, :num_reads=>100}
7.130000 0.040000 7.170000 ( 7.440953)
 
{:test_for=>56, :class=>Set, :benchmark_iterations=>10000, :num_rewrites=>5, :col_size=>100, :num_reads=>100}
5.750000 0.050000 5.800000 ( 6.090684)
----
 
{:test_for=>56, :class=>Array, :benchmark_iterations=>10000, :num_rewrites=>10, :col_size=>100, :num_reads=>100}
9.080000 0.050000 9.130000 ( 9.190300)
 
{:test_for=>56, :class=>Set, :benchmark_iterations=>10000, :num_rewrites=>10, :col_size=>100, :num_reads=>100}
10.920000 0.050000 10.970000 ( 11.042603)
source.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
#!/usr/bin/ruby
 
require 'set'
require 'benchmark'
 
 
def simulate(klass, options)
 
num_rewrites = options[:num_rewrites] ||= 1
coll_size = options[:col_size] ||= 10
num_reads = options[:num_reads] ||= 10
test_for = options[:test_for] ||= 56
benchmark_iterations = options[:benchmark_iterations] ||= 50000
options[:class] = klass
 
puts "\n#{options.inspect}"
puts( Benchmark.measure do
benchmark_iterations.times do
coll = klass.new
 
num_rewrites.times do |rewrite_n|
coll.clear if rewrite_n > 0
 
coll_size.times do |n|
coll << n
end
end
 
num_reads.times do
coll.include? test_for
end
end
end)
end
 
 
 
puts "----"
simulate(Array, :col_size => 10, :num_rewrites => 1, :num_reads => 10)
simulate(Set, :col_size => 10, :num_rewrites => 1, :num_reads => 10)
 
puts "----"
simulate(Array, :col_size => 100, :num_rewrites => 1, :num_reads => 10)
simulate(Set, :col_size => 100, :num_rewrites => 1, :num_reads => 10)
 
puts "----"
simulate(Array, :col_size => 100, :num_rewrites => 1, :num_reads => 1000)
simulate(Set, :col_size => 100, :num_rewrites => 1, :num_reads => 1000)
 
puts "---"
simulate(Array, :col_size => 1000, :num_rewrites => 1, :num_reads => 10)
simulate(Set, :col_size => 1000, :num_rewrites => 1, :num_reads => 10)
 
# etc

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.