public
Last active

n-gram

  • Download Gist
HashTable.hs
Haskell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
{-# LANGUAGE OverloadedStrings, BangPatterns #-}
import qualified Data.Sequence as S
import qualified Data.Foldable as F
import Control.Monad
import Data.Hashable
import Data.Sequence ((|>), ViewL(..))
import Data.Function (on)
import qualified Data.ByteString.Char8 as B
import qualified Data.HashTable.IO as H
import Data.List
import qualified Data.Char as C
 
instance Hashable a => Hashable (S.Seq a) where
hash = F.foldl' hashAndCombine stringSalt
hashWithSalt = F.foldl' hashAndCombine
 
stringSalt :: Int
stringSalt = 5381
 
hashAndCombine :: Hashable h => Int -> h -> Int
hashAndCombine acc h = acc `combine` hash h
 
defTableSize :: Int
defTableSize = 400000
 
insertWith :: (Int -> Int -> Int) -> ByteStringSeq -> Int
-> ByteStringSeqMap -> IO ()
insertWith f k v h = do r <- H.lookup h k
maybe (H.insert h k v)
(\v' -> H.insert h k $ f v v') r
 
 
type ByteStringSeq = S.Seq B.ByteString
type ByteStringSeqMap = H.BasicHashTable ByteStringSeq Int
 
isClassChar :: Char -> Bool
isClassChar a = C.isAlphaNum a || a == ' ' || a == '\''
|| a == '-' || a == '#' || a == '@' || a == '%'
 
cullWord :: B.ByteString -> B.ByteString
cullWord w = B.map C.toLower $ B.filter isClassChar w
 
procTextN :: Int -> B.ByteString -> IO [(ByteStringSeq, Int)]
procTextN n t = do h <- H.newSized defTableSize
mapM_ (ngram h) lns
H.toList h
where
!lns = B.lines $ cullWord t
ngram h line = foldM_ breakdown base (B.split ' ' line)
where
base = S.replicate (n-1) ""
breakdown st word = do insertWith (+) stw 1 h
return newStack
where
stw = st |> word
(_ :< (!newStack)) = S.viewl stw
 
main :: IO ()
main = do
test2 <- B.readFile "canewobble"
proc <- procTextN 3 test2
print . sortBy (flip compare `on` snd) . filter ((<) 100 . snd) $ proc

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.