Skip to content

Instantly share code, notes, and snippets.

@snej
Created February 2, 2009 05:04
Show Gist options
  • Save snej/56785 to your computer and use it in GitHub Desktop.
Save snej/56785 to your computer and use it in GitHub Desktop.
Implements a Bloom filter in the Scala language.
import scala.collection.BitSet
import scala.collection.mutable
import scala.runtime.RichString
/** Abstract trait for a Bloom filter, a data structure similar to a set, but which requires
far less memory (only a few bits per item). The trade-off is that you get false positives:
the filter may think it contains items that were never added to it. But for some
applications (especially distributed ones) the memory savings are worth it.
See: <http://en.wikipedia.org/wiki/Bloom_filter> */
@cloneable
trait AbsBloomFilter[T] {
/** @return True if the object could have been added to the filter, else false */
def couldContain (elem: T) :Boolean = hashes.forall( hash => _bits(hash(elem).abs % size) )
def apply (elem :T) = couldContain(elem)
/** The underlying BitSet storing the data. This can be stored or transmitted. */
val bits :BitSet = _bits
override def toString = this.getClass().getName() + "[" +
(0 until size).map( i => if (_bits(i)) '1' else '0' ).mkString + "]"
protected type BitsType <: BitSet
protected var _bits :BitsType
protected val hashes :List[T => Int]
protected val size = _bits.capacity
}
/** A concrete immutable Bloom filter.
@param hashes A list of hash functions that are used to set and test the bits.
@param b The bits storing the filter data. */
class BloomFilter[T] (val hashes :List[T => Int], val b :BitSet)
extends {var _bits = b}
with AbsBloomFilter[T] {
override def clone() = this
protected type BitsType = BitSet
}
/** A concrete mutable Bloom filter.
@param hashes A list of hash functions that are used to set and test the bits.
@param b The bits storing the filter data. */
class MutableBloomFilter[T] (val hashes :List[T => Int], val b :mutable.BitSet)
extends {var _bits = b.clone()}
with AbsBloomFilter[T] {
/** Creates a new empty Bloom filter.
@param hashes A list of hash functions that are used to set and test the bits.
@param s The size in bits of the BitSet to use. Larger values take more memory
(of course) but produce fewer false positives. */
def this(hashes :List[T => Int], s :Int) = this(hashes, new mutable.BitSet(s))
/** Adds an object to the Bloom filter. */
def += (elem :T) = hashes.foreach( hash => _bits += (hash(elem).abs % size) )
override def clone = new MutableBloomFilter(hashes,_bits)
protected type BitsType = mutable.BitSet
}
/** A useful group of hash functions for use with Bloom filters.
Implementations are in Arash Partow's General Hash Function library
<http://www.partow.net/programming/hashfunctions> */
object HashFunctions {
def RS (str :String) = GeneralHashFunctionLibrary.RSHash(str).asInstanceOf[Int]
def PJW (str :String) = GeneralHashFunctionLibrary.PJWHash(str).asInstanceOf[Int]
def JS (str :String) = GeneralHashFunctionLibrary.JSHash(str).asInstanceOf[Int]
def ELF (str :String) = GeneralHashFunctionLibrary.ELFHash(str).asInstanceOf[Int]
def BKDR(str :String) = GeneralHashFunctionLibrary.BKDRHash(str).asInstanceOf[Int]
def SDBM(str :String) = GeneralHashFunctionLibrary.SDBMHash(str).asInstanceOf[Int]
def DJB (str :String) = GeneralHashFunctionLibrary.DJBHash(str).asInstanceOf[Int]
def DEK (str :String) = GeneralHashFunctionLibrary.DEKHash(str).asInstanceOf[Int]
def BP (str :String) = GeneralHashFunctionLibrary.BPHash(str).asInstanceOf[Int]
def FNV (str :String) = GeneralHashFunctionLibrary.FNVHash(str).asInstanceOf[Int]
def AP (str :String) = GeneralHashFunctionLibrary.APHash(str).asInstanceOf[Int]
val all : List[String=>Int] = List(RS,PJW,JS,ELF,BKDR,SDBM,DJB,DEK,BP,FNV,AP)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment