Created
June 30, 2012 07:07
-
-
Save bradclawsie/3022745 to your computer and use it in GitHub Desktop.
sha1_bloom_filter.hs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import qualified Crypto.Hash.SHA1 as SHA1 | |
import qualified Data.Text as T | |
import qualified Data.Text.Encoding as TE | |
import qualified Data.ByteString as B | |
import qualified Text.Printf as P | |
import qualified Data.Word as W | |
import qualified Data.List.Split as S | |
import qualified Foreign as F | |
import qualified System.IO.Unsafe as UNSAFE | |
import qualified Control.Monad.Primitive as P | |
import qualified Data.Vector.Unboxed as U | |
import qualified Data.Vector.Unboxed.Mutable as UM | |
import qualified Control.Monad as C | |
import qualified Data.ByteString.UTF8 as UTF8 | |
import qualified Data.Vector as V | |
-- get a printable representation of the hash | |
-- borrowed from: | |
-- http://stackoverflow.com/questions/9502777/sha1-encoding-in-haskell | |
-- use as (putStrLn . toHex) h | |
-- for some hash h | |
toHex :: B.ByteString -> String | |
toHex bytes = B.unpack bytes >>= P.printf "%02x" | |
-- simpe alias | |
word32ToInt :: F.Word32 -> Int | |
word32ToInt = fromIntegral | |
-- borrowed from: | |
-- http://hackage.haskell.org/packages/archive/random-source/latest/doc/html/src/Data-Random-Internal-Words.html#buildWord32 | |
buildWord32 :: W.Word8 -> W.Word8 -> W.Word8 -> W.Word8 -> W.Word32 | |
buildWord32 b0 b1 b2 b3 | |
= UNSAFE.unsafePerformIO . F.allocaBytes 4 $ \p -> do | |
F.pokeByteOff p 0 b0 | |
F.pokeByteOff p 1 b1 | |
F.pokeByteOff p 2 b2 | |
F.pokeByteOff p 3 b3 | |
F.peek (F.castPtr p) | |
-- transform the sha1 hash bytes into five Ints | |
-- numOffsets is the number of 32bitInt chunks to use. since there | |
-- are only five in total, any number provided here larger than five | |
-- will result in all | |
chunksWord32Int :: B.ByteString -> Int -> [Int] | |
chunksWord32Int h numOffsets = | |
-- the sha1 hash is composed of twenty Word8s (160 bits). | |
-- we split group them by four (making five groups). four since 4*8 = 32, | |
-- and we want 32bit Ints | |
let word8sby4 = S.splitEvery 4 $ B.unpack h in | |
-- make each into a Word32, which we then coerce into its Int represention | |
take numOffsets $ map toWord32Int word8sby4 | |
-- make this is private function since we are not checking to see if the | |
-- list of word8s is of length four (we can assume so here, but generally | |
-- this would make the function brittle were it top-level) | |
where | |
toWord32Int word8s = | |
word32ToInt $ buildWord32 (word8s!!0) (word8s!!1) (word8s!!2) (word8s!!3) | |
-- transform the sha1 hash into a list of numOffsets offsets into the filter | |
filterOffsets :: B.ByteString -> Int -> Int -> [Int] | |
filterOffsets h n numOffsets = | |
-- given the sha1 hash, break it into five Word32Ints | |
-- and then get the modulo offsets in the bloom filter | |
map (`mod` n) $ chunksWord32Int h numOffsets | |
-- construct an initially empty (all False) bloom filter of length n | |
initialFilter :: P.PrimMonad m => Int -> m (UM.MVector (P.PrimState m) Bool) | |
initialFilter n = UM.replicate n False | |
-- build the sha1 hash for the text | |
toSHA1 :: T.Text -> B.ByteString | |
toSHA1 t = SHA1.hash $ TE.encodeUtf8 t | |
-- given a Text input, compute the sha1 hash, get the first numOffsets | |
-- filter offsets for the constituent Word32Ints, and write the | |
-- bloom filter to True at those offsets. | |
-- we know that all offsets are within the bounds of the vector given the | |
-- modulus calculation in filterOffsets | |
writeFilter :: P.PrimMonad m => UM.MVector (P.PrimState m) Bool -> Int -> T.Text -> m () | |
writeFilter bf numOffsets t = let h = toSHA1 t | |
l = UM.length bf | |
offsets = filterOffsets h l numOffsets in | |
C.mapM_ (\i -> UM.write bf i True) offsets | |
-- given a Text input, compute the sha1 hash, get the first numOffsets | |
-- filter offsets for the constituent Word32Ints, and read the | |
-- bloom filter values at those offsets. | |
-- we know that all offsets are within the bounds of the vector given the | |
-- modulus calculation in filterOffsets | |
readFilter :: (P.PrimMonad m, UM.Unbox b) => UM.MVector (P.PrimState m) b -> Int -> T.Text -> m [b] | |
readFilter bf numOffsets t = let h = toSHA1 t | |
l = UM.length bf | |
offsets = filterOffsets h l numOffsets in | |
C.mapM (\i -> UM.read bf i) offsets | |
-- alias for the test for membership, apply to results of readFilter | |
inFilter :: [Bool] -> Bool | |
inFilter = all (== True) | |
-- testing function | |
tryFilter :: P.PrimMonad m => UM.MVector (P.PrimState m) Bool -> Int -> UTF8.ByteString -> m (T.Text, Bool) | |
tryFilter bf numOffsets bs = let t = TE.decodeUtf8 bs in | |
do | |
rf <- readFilter bf numOffsets t | |
case inFilter rf of | |
True -> return (t,False) | |
_ -> writeFilter bf numOffsets t >> return (t,True) | |
main :: IO () | |
main = do | |
words <- C.liftM (V.fromList . UTF8.lines) $ B.readFile "/etc/dictionaries-common/words" | |
let n = 3000000 | |
let numOffsets = 5 :: Int -- numOffsets == number of hashes to use (1..5) | |
bf <- initialFilter n | |
rs <- C.mapM (tryFilter bf numOffsets) (V.toList words) | |
let collisions = filter (\i -> (snd i) == False) rs | |
putStrLn $ show collisions | |
fa <- readFilter bf numOffsets $ T.pack "Abe" | |
putStrLn $ show $ inFilter fa | |
fb <- readFilter bf numOffsets $ T.pack "Aaaavbe" | |
putStrLn $ show $ inFilter fb | |
return () |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment