Skip to content

Instantly share code, notes, and snippets.

@23Skidoo
Created October 22, 2011 08:54
Show Gist options
  • Save 23Skidoo/1305798 to your computer and use it in GitHub Desktop.
Save 23Skidoo/1305798 to your computer and use it in GitHub Desktop.
n-gram
{-# LANGUAGE OverloadedStrings, BangPatterns #-}
import qualified Data.Sequence as S
import qualified Data.Foldable as F
import Control.Monad
import Data.Hashable
import Data.Sequence ((|>), ViewL(..))
import Data.Function (on)
import qualified Data.ByteString.Char8 as B
import qualified Data.HashTable.IO as H
import Data.List
import qualified Data.Char as C
instance Hashable a => Hashable (S.Seq a) where
hash = F.foldl' hashAndCombine stringSalt
hashWithSalt = F.foldl' hashAndCombine
stringSalt :: Int
stringSalt = 5381
hashAndCombine :: Hashable h => Int -> h -> Int
hashAndCombine acc h = acc `combine` hash h
defTableSize :: Int
defTableSize = 400000
insertWith :: (Int -> Int -> Int) -> ByteStringSeq -> Int
-> ByteStringSeqMap -> IO ()
insertWith f k v h = do r <- H.lookup h k
maybe (H.insert h k v)
(\v' -> H.insert h k $ f v v') r
type ByteStringSeq = S.Seq B.ByteString
type ByteStringSeqMap = H.BasicHashTable ByteStringSeq Int
isClassChar :: Char -> Bool
isClassChar a = C.isAlphaNum a || a == ' ' || a == '\''
|| a == '-' || a == '#' || a == '@' || a == '%'
cullWord :: B.ByteString -> B.ByteString
cullWord w = B.map C.toLower $ B.filter isClassChar w
procTextN :: Int -> B.ByteString -> IO [(ByteStringSeq, Int)]
procTextN n t = do h <- H.newSized defTableSize
mapM_ (ngram h) lns
H.toList h
where
!lns = B.lines $ cullWord t
ngram h line = foldM_ breakdown base (B.split ' ' line)
where
base = S.replicate (n-1) ""
breakdown st word = do insertWith (+) stw 1 h
return newStack
where
stw = st |> word
(_ :< (!newStack)) = S.viewl stw
main :: IO ()
main = do
test2 <- B.readFile "canewobble"
proc <- procTextN 3 test2
print . sortBy (flip compare `on` snd) . filter ((<) 100 . snd) $ proc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment