Skip to content

Instantly share code, notes, and snippets.

@jamescway
Created September 9, 2012 01:56
Show Gist options
  • Save jamescway/3682043 to your computer and use it in GitHub Desktop.
Save jamescway/3682043 to your computer and use it in GitHub Desktop.
bloomfilter
require 'digest'
require 'murmurhash3' #gem install murmurhash3
require 'trollop'
class BloomFilter
def initialize(size)
@size = size.to_i || 2_000_000
@num_inputs = 0
@bloom_filter = Array.new(size)
@bloom_filter.fill(0)
end
def hash_md5(str)
Integer('0x' + Digest::MD5.hexdigest(str))
end
def hash_murmur(str)
MurmurHash3::V32.str_hash(str)
end
def load_words(filename)
File.open(filename).each_with_index do |word, i|
#p "#{i} - #{@bloom_filter.to_s.count("1")}" if i % 10000 == 0
p "#{i}" if i % 10000 == 0
word.strip!
hash1 = hash_md5(word) % @size
hash2 = hash_murmur(word) % @size
@bloom_filter[hash1] = 1
@bloom_filter[hash2] = 1
@num_inputs = i if i > @num_inputs
end
end
def in_filter?(word)
hash1 = hash_md5(word) % @size
hash2 = hash_murmur(word) % @size
if @bloom_filter[hash1] == 1 && @bloom_filter[hash2] == 1
return true
end
false
end
def percent_false_positive
num_hashes = 2.0
((( 1-Math.exp( (-num_hashes * @num_inputs)/(@size) ) )**num_hashes)*100).round(6)
end
def check_dictionary(filename)
File.open(filename).each_with_index do |word, i|
if !in_filter?(word.strip!)
p "#{word}(#{i}) wasn't in filter!!"
break
end
end
end
end
opts = Trollop::options do
opt :size, "size", :default => 2_000_000 # integer --size <i>, default to 2_000_000
end
bf = BloomFilter.new(opts[:size])
puts "Filter Size: #{opts[:size]}"
bf.load_words("/usr/share/dict/words")
p "FALSE POSITIVES: #{bf.percent_false_positive}%"
bf.check_dictionary("/usr/share/dict/words")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment