Skip to content

Instantly share code, notes, and snippets.

@toby
Created October 29, 2014 17:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save toby/02faa64b286ac4cceca0 to your computer and use it in GitHub Desktop.
Save toby/02faa64b286ac4cceca0 to your computer and use it in GitHub Desktop.
(ns backrack.bloom
(:import [java.lang Math])
(:use [digest :only [sha-256]]))
(deftype Bloom [m k f]
Object
(equals [this b2]
(and (= (.m this) (.m b2)) (= (.k this) (.k b2)) (= (.f this) (.f b2))))
(toString [this]
(str "m: " (.m this) " k: " (.k this))))
(defprotocol BloomOps
(add-item [bloom item])
(remove-item [bloom item])
(added? [bloom item])
(blank? [bloom])
(full? [bloom]))
(defn- log-base [n base]
(/ (Math/log n) (Math/log base)))
(defn- index-bytes [m]
(int (Math/ceil (/ (Math/floor (log-base m 2)) 8.0))))
(defn- forever-sha-256
([item]
(forever-sha-256 item 0))
([item modifier]
(if (= modifier 0)
(lazy-cat (sha-256 item) (forever-sha-256 item (inc modifier)))
(lazy-cat (sha-256 (str item modifier)) (forever-sha-256 item (inc modifier))))))
(defn- hash-indexes [m k item]
(take k (map #(mod (Integer/parseInt (apply str %) 16) m)
(partition (bit-shift-left (index-bytes m) 1) (forever-sha-256 item)))))
(defn- has-indexes? [f indexes]
(every? pos? (map f indexes)))
(defn- update-bloom [m k f indexes function]
(if (empty? indexes)
(Bloom. m k f)
(recur m k (update-in f [(first indexes)] function) (rest indexes) function)))
(extend-type Bloom
BloomOps
(add-item [this item]
(let [indexes (hash-indexes (.m this) (.k this) item)]
(if-not (has-indexes? (.f this) indexes)
(update-bloom (.m this) (.k this) (.f this) indexes inc)
this)))
(remove-item [this item]
(let [indexes (hash-indexes (.m this) (.k this) item)]
(if (has-indexes? (.f this) indexes)
(update-bloom (.m this) (.k this) (.f this) indexes dec)
this)))
(added? [this item]
(has-indexes? (.f this) (hash-indexes (.m this) (.k this) item)))
(blank? [this]
(every? zero? (.f this)))
(full? [this]
(every? pos? (.f this))))
(def blank-filter
(fn [m] (apply conj (vector-of :short) (repeat m (short 0)))))
(defn create-bloom
([]
(create-bloom 5000 0.001))
([capacity error-rate]
(let [m (long (Math/ceil (/ (* capacity (Math/log error-rate))
(Math/log (/ 1 (Math/pow 2 (Math/log 2)))))))
k (long (Math/floor (* (Math/log 2) (/ m capacity))))]
(Bloom. m k (blank-filter m)))))
(defn encode [bloom]
(let [parts (partition-by identity (.f bloom))]
(hash-map :m (.m bloom) :k (.k bloom) :filter (map (juxt first count) parts))))
(defn decode [encoded]
(let [f (apply vector-of :short (mapcat (fn [[value length]] (repeat length (short value))) (:filter encoded)))]
(Bloom. (:m encoded) (:k encoded) f)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment