Skip to content

Instantly share code, notes, and snippets.

@cartazio
Created June 10, 2012 00:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save cartazio/2903178 to your computer and use it in GitHub Desktop.
Save cartazio/2903178 to your computer and use it in GitHub Desktop.
vowpal hash
\begin{code}
{-# LANGUAGE BangPatterns #-}
{-|
Module : Data.Digest.VowpalHash
Copyright : (c) Carter Tazio Schonwald
License : BSD-style
Maintainer : first dot last @gmail.com
Stability : experimental
Portability : portable
based on murmur-hash2 implementation in the murmur-hash package
and also
-}
module Data.Digest.VowpalHash where
import Data.Word
--import Data.Digest.Murmur32 -- (asWord32,hash32AddWord32,Hash32)
import Data.Int
import Data.Bits
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as C (pack,unpack)
import Data.Foldable (foldl',Foldable)
-- hash32end and hash32AddWord32 are from the hackage murmur-hash lib
liftWord :: Word8 ->Word32
liftWord = fromIntegral
hash32End :: Word32 -> Word32
hash32End ( h) =
let h1 = h `xor` (h `shiftR` 13)
h2 = h1 * murmur_m
h3 = h2 `xor` (h2 `shiftR` 15)
in h3
bStringAsWords :: B.ByteString -> ([Word32], Maybe Word32)
bStringAsWords str = chew [] $! B.unpack $! str
where
chew :: [Word32] -> [Word8] -> ([Word32], Maybe Word32)
chew sofar [] = (reverse sofar, Nothing)
chew sofar [a] = (reverse sofar, Just $ adjust [a])
chew sofar [a,b] = (reverse sofar, Just $ adjust [a,b])
chew sofar [a,b,c] = (reverse sofar , Just $ adjust [a,b,c])
chew sofar (a:b:c:d : rest) = chew (quadChange (a,b,c,d) : sofar) rest -- something wrong happens here?
quadChange :: (Word8,Word8,Word8,Word8)->Word32
quadChange (a,b,c,d)=fromIntegral a + ( fromIntegral b `shiftL` 8) +
(fromIntegral c `shiftL` 16 ) + (fromIntegral d `shiftL` 24)
adjust :: [Word8]-> Word32
--adjust [a,b,c,d] = fromIntegral a + ( fromIntegral b `shiftL` 8) +
-- (fromIntegral c `shiftL` 16 ) + (fromIntegral d `shiftL` 24)
adjust [a,b,c] = fromIntegral a + ( fromIntegral b `shiftL` 8) +
(fromIntegral c `shiftL` 16 )
adjust [a,b] = fromIntegral a + ( fromIntegral b `shiftL` 8)
adjust [a] = fromIntegral a
adjust _ = error "error, input must be a list with length 1-4"
--- takes next word & current seed -> new seed
hash32AddWord32 :: Word32 -> Word32 -> Word32
hash32AddWord32 k ( h) =
let k1 = k * murmur_m
k2 = k1 `xor` (k1 `shiftR` murmur_r)
k3 = k2 * murmur_m
h1 = h * murmur_m
h2 = h1 `xor` k3
in h2
murmur_m :: Word32
murmur_m = 0x5bd1e995
murmur_r :: Int --- this is just a shift, it doesn't need to be int32
murmur_r = 24
makeMask :: Int -> Word32
makeMask ind = ( one `shiftL` ind) - 1
where one :: Word32
one =1
hashBase :: Word32
hashBase = 97562527
hashWithSeed :: Word32 -> B.ByteString -> Word32
hashWithSeed seed str = hash32End lastState -- .&. makeMask 18 -- default size is 18
where
h = seed `xor` ( fromIntegral $! B.length str)
(ls, optRem)= bStringAsWords str
semilastState= hash32AddFoldable ls h
lastState =
case optRem of
Nothing -> semilastState -- * murmur_m --- not sure if mul is needed here
Just a -> (semilastState `xor` a ) * murmur_m
hash32AddFoldable :: ( Foldable c) => c Word32-> Word32->Word32
hash32AddFoldable c !h0 = foldl' f h0 c
where f h a = hash32AddWord32 a h
--this seems correct
hashFeature :: String ->Word32
hashFeature str =( hashWithSeed 0 $ C.pack str) .&. makeMask 18
hashClass :: String -> Word32
hashClass str = hashWithSeed hashBase $ C.pack str
--- hash for the constant feature,
constantHash :: Word32
constantHash = 11650396 .&. makeMask 18
{-const int quadratic_constant = 27942141;
const int constant = 11650396;
need to check if
-}
-- HashFeatureClass :: CLASS , FEATURENAME-> hash
-- if theres no class name or " ", use HashFeature instead!!
hashFeatureClass :: String -> String -> Word32
hashFeatureClass clss feature = (hashWithSeed (hashClass clss) $!
C.pack feature ) .&. makeMask 18
\end{code}
now lets figure out how to do the same with quadratic features!
const int quadratic_constant = 27942141;
88: size_t stride = global.stride; this doesn't matter actually
size_t halfhash = quadratic_constant * page_feature.weight_index;
HASH == (halfhash + ele->weight_index) & mask
\begin{code}
quadraticConstant :: Word32
quadraticConstant = 27942141
-- pairs of features appear twice in general, need to
-- hashFeaturePair (a,b) (c,d) != hashFeaturePair (c,d) (a,b)
-- (Class,Feature)
hashFeaturePair :: (String,String)-> (String,String)-> Word32
hashFeaturePair (classa,stra) (classb,strb) = ( hashFeatureClass classa stra * quadraticConstant + hashFeatureClass classb strb ) .&. makeMask 18
\end{code}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment