Created
November 13, 2014 00:08
-
-
Save conorgriffin/da28ce998a0cd8135766 to your computer and use it in GitHub Desktop.
Performance testing of various methods to reduce an array of strings to an array of unique individual words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'set' | |
# NOTE: This requires the benchmark-ips gem | |
# In case you don't want to install this gem, here's sample output of this file | |
# Calculating ------------------------------------- | |
# Old 1: 9 i/100ms | |
# New 1: 4 i/100ms | |
# New 2: 8 i/100ms | |
# New 3: 8 i/100ms | |
# New 4: 8 i/100ms | |
# New 5: 1 i/100ms | |
# ------------------------------------------------- | |
# Old 1: 98.1 (±3.1%) i/s - 495 in 5.051814s | |
# New 1: 47.4 (±10.6%) i/s - 236 in 5.017234s | |
# New 2: 78.7 (±8.9%) i/s - 392 in 5.030907s | |
# New 3: 87.5 (±4.6%) i/s - 440 in 5.042861s | |
# New 4: 84.3 (±5.9%) i/s - 424 in 5.049878s | |
# New 5: 6.7 (±0.0%) i/s - 34 in 5.067592s | |
# | |
# Comparison: | |
# Old 1:: 98.1 i/s | |
# New 3:: 87.5 i/s - 1.12x slower | |
# New 4:: 84.3 i/s - 1.16x slower | |
# New 2:: 78.7 i/s - 1.25x slower | |
# New 1:: 47.4 i/s - 2.07x slower | |
# New 5:: 6.7 i/s - 14.55x slower | |
# required gems | |
require 'benchmark/ips' | |
require 'test/unit' | |
class Array | |
# The original method we're looking to improve upon | |
# The method is intended to clean up an array of strings by separating each string | |
# into individual words and then generating a new Array consisting of only unique | |
# words. The idea is to improve upon the original method in performance and/or | |
# memory consumption | |
def clean_up_old | |
self.join(' ').split().uniq | |
end | |
# This one uses a Set as Sets don't allow duplicates, it removes the need for uniq | |
# on the output array. This one is slower than the original, probably because the | |
# merging of Sets for each element in the original Array takes more time than just | |
# calling uniq once on the output Array at the end. However, this should use less | |
# memory than the original as the Set will only contain unique words from the | |
# input array meaning it will be a subset of the input array. | |
def clean_up_new_1 | |
unique = Set.new | |
self.each { | |
|words| | |
unique.merge(words.split) | |
} | |
unique.to_a | |
end | |
# This one uses a Hash which also does not allow duplicates (keys) so I'm using it | |
# to store each unique word as a key and setting the value for that key to nil. | |
# This should use up less memory again when compared to the original but it's still | |
# slower | |
def clean_up_new_2 | |
unique = Hash.new | |
self.each { | |
|words| | |
words.split.each { | |
|word| | |
unique[word] = nil | |
} | |
} | |
unique.keys | |
end | |
# This one uses the Array method 'flat_map' which returns an Enumerator for the | |
# input Array's elements and splits each String entry, then returns only the | |
# unique entries from the result. This is close in performance to the original | |
# but in my opinion it is less readable although it may use less memory. | |
def clean_up_new_3 | |
self.flat_map(&:split).uniq | |
end | |
# This one moves through the original Array using inject to process | |
# each element containing space-separated words and appending them | |
# to a new array. Finally uniq is called to remove duplicate words | |
def clean_up_new_4 | |
self.inject([]) { | |
|array, item| | |
array.push(*item.split) | |
}.uniq | |
end | |
# This one uses inject to move through the Array, appending each string entry | |
# to the previous ones with a space in between, finally splitting into a new | |
# Array and using uniq to remove duplicates | |
def clean_up_new_5 | |
self.inject() { | |
|a, b| | |
a += ' ' + b | |
}.split.uniq | |
end | |
end | |
# Build an Array of random Strings made up of 1 - 6 words from the dictionary.txt | |
# file (80,000 words) | |
def build_input_array | |
array = Array.new | |
dictionary = File.readlines("dictionary.txt") | |
5000.times do | |
str = dictionary.sample.strip | |
word_count = rand(0..5) | |
word_count.times do | |
str += ' ' + dictionary.sample.strip | |
end | |
array << str | |
end | |
array | |
end | |
list = build_input_array | |
Benchmark.ips do |x| | |
x.report('Old 1:') do | |
list.clean_up_old | |
end | |
x.report('New 1:') do | |
list.clean_up_new_1 | |
end | |
x.report('New 2:') do | |
list.clean_up_new_2 | |
end | |
x.report('New 3:') do | |
list.clean_up_new_3 | |
end | |
x.report('New 4:') do | |
list.clean_up_new_4 | |
end | |
x.report('New 5:') do | |
list.clean_up_new_5 | |
end | |
x.compare! | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment