bradclawsie/sha1_bloom_filter.hs

## sha1_bloom_filter.hs
module Main where

import qualified Crypto.Hash.SHA1             as SHA1
import qualified Data.Text                    as T
import qualified Data.Text.Encoding           as TE
import qualified Data.ByteString              as B
import qualified Text.Printf                  as P
import qualified Data.Word                    as W
import qualified Data.List.Split              as S
import qualified Foreign                      as F
import qualified System.IO.Unsafe             as UNSAFE
import qualified Control.Monad.Primitive      as P
import qualified Data.Vector.Unboxed          as U
import qualified Data.Vector.Unboxed.Mutable  as UM
import qualified Control.Monad                as C
import qualified Data.ByteString.UTF8         as UTF8
import qualified Data.Vector                  as V


-- get a printable representation of the hash
-- borrowed from:
-- http://stackoverflow.com/questions/9502777/sha1-encoding-in-haskell
-- use as (putStrLn . toHex) h
-- for some hash h
toHex :: B.ByteString -> String
toHex bytes = B.unpack bytes >>= P.printf "%02x"

-- simpe alias
word32ToInt :: F.Word32 -> Int
word32ToInt = fromIntegral

-- borrowed from:
-- http://hackage.haskell.org/packages/archive/random-source/latest/doc/html/src/Data-Random-Internal-Words.html#buildWord32
buildWord32 :: W.Word8 -> W.Word8 -> W.Word8 -> W.Word8 -> W.Word32
buildWord32 b0 b1 b2 b3
  = UNSAFE.unsafePerformIO . F.allocaBytes 4 $ \p -> do
    F.pokeByteOff p 0 b0
    F.pokeByteOff p 1 b1
    F.pokeByteOff p 2 b2
    F.pokeByteOff p 3 b3
    F.peek (F.castPtr p)

-- transform the sha1 hash bytes into five Ints
-- numOffsets is the number of 32bitInt chunks to use. since there
-- are only five in total, any number provided here larger than five
-- will result in all
chunksWord32Int :: B.ByteString -> Int -> [Int]
chunksWord32Int h numOffsets =
  -- the sha1 hash is composed of twenty Word8s (160 bits).
  -- we split group them by four (making five groups). four since 4*8 = 32,
  -- and we want 32bit Ints
  let word8sby4 = S.splitEvery 4 $ B.unpack h in
  -- make each into a Word32, which we then coerce into its Int represention
  take numOffsets $ map toWord32Int word8sby4
  -- make this is private function since we are not checking to see if the
  -- list of word8s is of length four (we can assume so here, but generally
  -- this would make the function brittle were it top-level)
  where
    toWord32Int word8s =
      word32ToInt $ buildWord32 (word8s!!0) (word8s!!1) (word8s!!2) (word8s!!3)

-- transform the sha1 hash into a list of numOffsets offsets into the filter
filterOffsets :: B.ByteString -> Int -> Int -> [Int]
filterOffsets h n numOffsets =
  -- given the sha1 hash, break it into five Word32Ints
  -- and then get the modulo offsets in the bloom filter
  map (`mod` n) $ chunksWord32Int h numOffsets

-- construct an initially empty (all False) bloom filter of length n
initialFilter :: P.PrimMonad m => Int -> m (UM.MVector (P.PrimState m) Bool)
initialFilter n = UM.replicate n False

-- build the sha1 hash for the text
toSHA1 :: T.Text -> B.ByteString
toSHA1 t = SHA1.hash $ TE.encodeUtf8 t

-- given a Text input, compute the sha1 hash, get the first numOffsets
-- filter offsets for the  constituent Word32Ints, and write the
-- bloom filter to True at those offsets.
-- we know that all offsets are within the bounds of the vector given the
-- modulus calculation in filterOffsets
writeFilter :: P.PrimMonad m => UM.MVector (P.PrimState m) Bool -> Int -> T.Text -> m ()
writeFilter bf numOffsets t = let h = toSHA1 t
                                  l = UM.length bf
                                  offsets = filterOffsets h l numOffsets in
                              C.mapM_ (\i -> UM.write bf i True) offsets

-- given a Text input, compute the sha1 hash, get the first numOffsets
-- filter offsets for the constituent Word32Ints, and read the
-- bloom filter values at those offsets.
-- we know that all offsets are within the bounds of the vector given the
-- modulus calculation in filterOffsets
readFilter :: (P.PrimMonad m, UM.Unbox b) => UM.MVector (P.PrimState m) b -> Int -> T.Text -> m [b]
readFilter bf numOffsets t = let h = toSHA1 t
                                 l = UM.length bf
                                 offsets = filterOffsets h l numOffsets in
                             C.mapM (\i -> UM.read bf i) offsets

-- alias for the test for membership, apply to results of readFilter
inFilter :: [Bool] -> Bool
inFilter = all (== True)

-- testing function
tryFilter :: P.PrimMonad m => UM.MVector (P.PrimState m) Bool -> Int -> UTF8.ByteString -> m (T.Text, Bool)
tryFilter bf numOffsets bs = let t = TE.decodeUtf8 bs in
  do
    rf <- readFilter bf numOffsets t
    case inFilter rf of
      True -> return (t,False)
      _ -> writeFilter bf numOffsets t >> return (t,True)

main :: IO ()
main = do
  words <- C.liftM (V.fromList . UTF8.lines) $ B.readFile "/etc/dictionaries-common/words"
  let n = 3000000
  let numOffsets = 5 :: Int -- numOffsets == number of hashes to use (1..5)
  bf <- initialFilter n
  rs <- C.mapM (tryFilter bf numOffsets) (V.toList words)
  let collisions = filter (\i -> (snd i) == False) rs
  putStrLn $ show collisions
  fa <- readFilter bf numOffsets $ T.pack "Abe"
  putStrLn $ show $ inFilter fa
  fb <- readFilter bf numOffsets $ T.pack "Aaaavbe"
  putStrLn $ show $ inFilter fb
  return ()
	module Main where

	import qualified Crypto.Hash.SHA1 as SHA1
	import qualified Data.Text as T
	import qualified Data.Text.Encoding as TE
	import qualified Data.ByteString as B
	import qualified Text.Printf as P
	import qualified Data.Word as W
	import qualified Data.List.Split as S
	import qualified Foreign as F
	import qualified System.IO.Unsafe as UNSAFE
	import qualified Control.Monad.Primitive as P
	import qualified Data.Vector.Unboxed as U
	import qualified Data.Vector.Unboxed.Mutable as UM
	import qualified Control.Monad as C
	import qualified Data.ByteString.UTF8 as UTF8
	import qualified Data.Vector as V


	-- get a printable representation of the hash
	-- borrowed from:
	-- http://stackoverflow.com/questions/9502777/sha1-encoding-in-haskell
	-- use as (putStrLn . toHex) h
	-- for some hash h
	toHex :: B.ByteString -> String
	toHex bytes = B.unpack bytes >>= P.printf "%02x"

	-- simpe alias
	word32ToInt :: F.Word32 -> Int
	word32ToInt = fromIntegral

	-- borrowed from:
	-- http://hackage.haskell.org/packages/archive/random-source/latest/doc/html/src/Data-Random-Internal-Words.html#buildWord32
	buildWord32 :: W.Word8 -> W.Word8 -> W.Word8 -> W.Word8 -> W.Word32
	buildWord32 b0 b1 b2 b3
	= UNSAFE.unsafePerformIO . F.allocaBytes 4 $ \p -> do
	F.pokeByteOff p 0 b0
	F.pokeByteOff p 1 b1
	F.pokeByteOff p 2 b2
	F.pokeByteOff p 3 b3
	F.peek (F.castPtr p)

	-- transform the sha1 hash bytes into five Ints
	-- numOffsets is the number of 32bitInt chunks to use. since there
	-- are only five in total, any number provided here larger than five
	-- will result in all
	chunksWord32Int :: B.ByteString -> Int -> [Int]
	chunksWord32Int h numOffsets =
	-- the sha1 hash is composed of twenty Word8s (160 bits).
	-- we split group them by four (making five groups). four since 4*8 = 32,
	-- and we want 32bit Ints
	let word8sby4 = S.splitEvery 4 $ B.unpack h in
	-- make each into a Word32, which we then coerce into its Int represention
	take numOffsets $ map toWord32Int word8sby4
	-- make this is private function since we are not checking to see if the
	-- list of word8s is of length four (we can assume so here, but generally
	-- this would make the function brittle were it top-level)
	where
	toWord32Int word8s =
	word32ToInt $ buildWord32 (word8s!!0) (word8s!!1) (word8s!!2) (word8s!!3)

	-- transform the sha1 hash into a list of numOffsets offsets into the filter
	filterOffsets :: B.ByteString -> Int -> Int -> [Int]
	filterOffsets h n numOffsets =
	-- given the sha1 hash, break it into five Word32Ints
	-- and then get the modulo offsets in the bloom filter
	map (`mod` n) $ chunksWord32Int h numOffsets

	-- construct an initially empty (all False) bloom filter of length n
	initialFilter :: P.PrimMonad m => Int -> m (UM.MVector (P.PrimState m) Bool)
	initialFilter n = UM.replicate n False

	-- build the sha1 hash for the text
	toSHA1 :: T.Text -> B.ByteString
	toSHA1 t = SHA1.hash $ TE.encodeUtf8 t

	-- given a Text input, compute the sha1 hash, get the first numOffsets
	-- filter offsets for the constituent Word32Ints, and write the
	-- bloom filter to True at those offsets.
	-- we know that all offsets are within the bounds of the vector given the
	-- modulus calculation in filterOffsets
	writeFilter :: P.PrimMonad m => UM.MVector (P.PrimState m) Bool -> Int -> T.Text -> m ()
	writeFilter bf numOffsets t = let h = toSHA1 t
	l = UM.length bf
	offsets = filterOffsets h l numOffsets in
	C.mapM_ (\i -> UM.write bf i True) offsets

	-- given a Text input, compute the sha1 hash, get the first numOffsets
	-- filter offsets for the constituent Word32Ints, and read the
	-- bloom filter values at those offsets.
	-- we know that all offsets are within the bounds of the vector given the
	-- modulus calculation in filterOffsets
	readFilter :: (P.PrimMonad m, UM.Unbox b) => UM.MVector (P.PrimState m) b -> Int -> T.Text -> m [b]
	readFilter bf numOffsets t = let h = toSHA1 t
	l = UM.length bf
	offsets = filterOffsets h l numOffsets in
	C.mapM (\i -> UM.read bf i) offsets

	-- alias for the test for membership, apply to results of readFilter
	inFilter :: [Bool] -> Bool
	inFilter = all (== True)

	-- testing function
	tryFilter :: P.PrimMonad m => UM.MVector (P.PrimState m) Bool -> Int -> UTF8.ByteString -> m (T.Text, Bool)
	tryFilter bf numOffsets bs = let t = TE.decodeUtf8 bs in
	do
	rf <- readFilter bf numOffsets t
	case inFilter rf of
	True -> return (t,False)
	_ -> writeFilter bf numOffsets t >> return (t,True)

	main :: IO ()
	main = do
	words <- C.liftM (V.fromList . UTF8.lines) $ B.readFile "/etc/dictionaries-common/words"
	let n = 3000000
	let numOffsets = 5 :: Int -- numOffsets == number of hashes to use (1..5)
	bf <- initialFilter n
	rs <- C.mapM (tryFilter bf numOffsets) (V.toList words)
	let collisions = filter (\i -> (snd i) == False) rs
	putStrLn $ show collisions
	fa <- readFilter bf numOffsets $ T.pack "Abe"
	putStrLn $ show $ inFilter fa
	fb <- readFilter bf numOffsets $ T.pack "Aaaavbe"
	putStrLn $ show $ inFilter fb
	return ()