Skip to content

Instantly share code, notes, and snippets.

@bradclawsie
Created June 30, 2012 07:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bradclawsie/3022745 to your computer and use it in GitHub Desktop.
Save bradclawsie/3022745 to your computer and use it in GitHub Desktop.
sha1_bloom_filter.hs
module Main where
import qualified Crypto.Hash.SHA1 as SHA1
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Data.ByteString as B
import qualified Text.Printf as P
import qualified Data.Word as W
import qualified Data.List.Split as S
import qualified Foreign as F
import qualified System.IO.Unsafe as UNSAFE
import qualified Control.Monad.Primitive as P
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Unboxed.Mutable as UM
import qualified Control.Monad as C
import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.Vector as V
-- get a printable representation of the hash
-- borrowed from:
-- http://stackoverflow.com/questions/9502777/sha1-encoding-in-haskell
-- use as (putStrLn . toHex) h
-- for some hash h
toHex :: B.ByteString -> String
toHex bytes = B.unpack bytes >>= P.printf "%02x"
-- simpe alias
word32ToInt :: F.Word32 -> Int
word32ToInt = fromIntegral
-- borrowed from:
-- http://hackage.haskell.org/packages/archive/random-source/latest/doc/html/src/Data-Random-Internal-Words.html#buildWord32
buildWord32 :: W.Word8 -> W.Word8 -> W.Word8 -> W.Word8 -> W.Word32
buildWord32 b0 b1 b2 b3
= UNSAFE.unsafePerformIO . F.allocaBytes 4 $ \p -> do
F.pokeByteOff p 0 b0
F.pokeByteOff p 1 b1
F.pokeByteOff p 2 b2
F.pokeByteOff p 3 b3
F.peek (F.castPtr p)
-- transform the sha1 hash bytes into five Ints
-- numOffsets is the number of 32bitInt chunks to use. since there
-- are only five in total, any number provided here larger than five
-- will result in all
chunksWord32Int :: B.ByteString -> Int -> [Int]
chunksWord32Int h numOffsets =
-- the sha1 hash is composed of twenty Word8s (160 bits).
-- we split group them by four (making five groups). four since 4*8 = 32,
-- and we want 32bit Ints
let word8sby4 = S.splitEvery 4 $ B.unpack h in
-- make each into a Word32, which we then coerce into its Int represention
take numOffsets $ map toWord32Int word8sby4
-- make this is private function since we are not checking to see if the
-- list of word8s is of length four (we can assume so here, but generally
-- this would make the function brittle were it top-level)
where
toWord32Int word8s =
word32ToInt $ buildWord32 (word8s!!0) (word8s!!1) (word8s!!2) (word8s!!3)
-- transform the sha1 hash into a list of numOffsets offsets into the filter
filterOffsets :: B.ByteString -> Int -> Int -> [Int]
filterOffsets h n numOffsets =
-- given the sha1 hash, break it into five Word32Ints
-- and then get the modulo offsets in the bloom filter
map (`mod` n) $ chunksWord32Int h numOffsets
-- construct an initially empty (all False) bloom filter of length n
initialFilter :: P.PrimMonad m => Int -> m (UM.MVector (P.PrimState m) Bool)
initialFilter n = UM.replicate n False
-- build the sha1 hash for the text
toSHA1 :: T.Text -> B.ByteString
toSHA1 t = SHA1.hash $ TE.encodeUtf8 t
-- given a Text input, compute the sha1 hash, get the first numOffsets
-- filter offsets for the constituent Word32Ints, and write the
-- bloom filter to True at those offsets.
-- we know that all offsets are within the bounds of the vector given the
-- modulus calculation in filterOffsets
writeFilter :: P.PrimMonad m => UM.MVector (P.PrimState m) Bool -> Int -> T.Text -> m ()
writeFilter bf numOffsets t = let h = toSHA1 t
l = UM.length bf
offsets = filterOffsets h l numOffsets in
C.mapM_ (\i -> UM.write bf i True) offsets
-- given a Text input, compute the sha1 hash, get the first numOffsets
-- filter offsets for the constituent Word32Ints, and read the
-- bloom filter values at those offsets.
-- we know that all offsets are within the bounds of the vector given the
-- modulus calculation in filterOffsets
readFilter :: (P.PrimMonad m, UM.Unbox b) => UM.MVector (P.PrimState m) b -> Int -> T.Text -> m [b]
readFilter bf numOffsets t = let h = toSHA1 t
l = UM.length bf
offsets = filterOffsets h l numOffsets in
C.mapM (\i -> UM.read bf i) offsets
-- alias for the test for membership, apply to results of readFilter
inFilter :: [Bool] -> Bool
inFilter = all (== True)
-- testing function
tryFilter :: P.PrimMonad m => UM.MVector (P.PrimState m) Bool -> Int -> UTF8.ByteString -> m (T.Text, Bool)
tryFilter bf numOffsets bs = let t = TE.decodeUtf8 bs in
do
rf <- readFilter bf numOffsets t
case inFilter rf of
True -> return (t,False)
_ -> writeFilter bf numOffsets t >> return (t,True)
main :: IO ()
main = do
words <- C.liftM (V.fromList . UTF8.lines) $ B.readFile "/etc/dictionaries-common/words"
let n = 3000000
let numOffsets = 5 :: Int -- numOffsets == number of hashes to use (1..5)
bf <- initialFilter n
rs <- C.mapM (tryFilter bf numOffsets) (V.toList words)
let collisions = filter (\i -> (snd i) == False) rs
putStrLn $ show collisions
fa <- readFilter bf numOffsets $ T.pack "Abe"
putStrLn $ show $ inFilter fa
fb <- readFilter bf numOffsets $ T.pack "Aaaavbe"
putStrLn $ show $ inFilter fb
return ()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment