Skip to content

Instantly share code, notes, and snippets.

@ivant
Created July 10, 2011 05:12
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ivant/1074297 to your computer and use it in GitHub Desktop.
Save ivant/1074297 to your computer and use it in GitHub Desktop.
wc using Data.Enumerator
.PHONY: all clean test
DATA_REPEATS?=1000
DATA_FILE?=/usr/share/dict/words
GHC_OPTS?=-fspec-constr-count=64 -funfolding-use-threshold=64
all: wce wci
clean:
rm -f wce wci *.hi *.o
test: all
@echo Measuring wc using Data.Enumerator:
@bash -c 'time (for i in `seq $(DATA_REPEATS)`; do cat $(DATA_FILE); done) | ./wce'
@echo
@echo Measuring wc using Data.Iteratee:
@bash -c 'time (for i in `seq $(DATA_REPEATS)`; do cat $(DATA_FILE); done) | ./wci'
@echo
@echo Measuring original wc:
@bash -c 'time (for i in `seq $(DATA_REPEATS)`; do cat $(DATA_FILE); done) | wc'
wce: WCEnumerator.hs
ghc $(GHC_OPTS) -o $@ -O2 --make $<
wci: WCIteratee.hs
ghc $(GHC_OPTS) -o $@ -O2 --make $<
{-# LANGUAGE ScopedTypeVariables, ViewPatterns, OverloadedStrings, BangPatterns #-}
import Control.Applicative
import Control.Exception (SomeException)
import Control.Monad (when, mapM)
import Control.Monad.Trans (lift)
import Data.Char (ord)
import Data.Enumerator hiding (map, mapM, length, filter, foldl')
import qualified Data.Enumerator.List as EL
import qualified Data.Enumerator.Binary as EB
import Data.List (foldl')
import Data.Word (Word8, Word64)
import qualified Data.ByteString as SB
import qualified Data.ByteString.Char8 as SB8
import qualified Data.ByteString.Lazy as LB; import Data.ByteString.Lazy (ByteString)
import System.IO (stdin, hSetBinaryMode)
import Text.Printf (printf)
type Counter = Word64 -- Int is ~3% faster than Word64
type CountingIteratee m = Iteratee SB.ByteString m Counter
countBytes :: forall m. Monad m => CountingIteratee m
countBytes = continue (step 0)
where
step :: Monad m => Counter -> Stream SB.ByteString -> CountingIteratee m
step n _ | n `seq` False = undefined
step n EOF = return n
step n (Chunks cs) = continue (step $ n + fromIntegral (sum (map SB.length cs)))
countBytes' :: forall m. Monad m => CountingIteratee m
countBytes' = EL.fold (\(!c) bs -> c + fromIntegral (SB.length bs)) 0
countWords :: forall m. Monad m => CountingIteratee m
--countWords = continue (step 0 True)
--countWords = countWords' 0
countWords = fst <$> EL.fold countInChunk (0, True)
where
isSpc :: Word8 -> Bool
isSpc c = c `SB.elem` " \t\r\n\f\v\xa0"
countWords' :: Monad m => Counter -> CountingIteratee m
countWords' n | n `seq` False = undefined
countWords' n = do
_ <- EB.takeWhile isSpc
cs <- EB.takeWhile (not . isSpc)
if LB.null cs then
return n
else do
countWords' (n + 1) -- TODO is that properly optimized (do we get a stack overflow?)
step :: Monad m => Counter -> Bool -> Stream SB.ByteString -> CountingIteratee m
step n endedWithSpace _ | n `seq` endedWithSpace `seq` False = undefined
step n _ EOF = return n
step n endedWithSpace (Chunks cs) = continue (uncurry step $ countInChunks n endedWithSpace cs)
countInChunks :: Counter -> Bool -> [SB.ByteString] -> (Counter, Bool)
countInChunks (!n) (!endedWithSpace) cs = foldl' countInChunk (n,endedWithSpace) cs
countInChunk :: (Counter,Bool) -> SB.ByteString -> (Counter, Bool)
countInChunk (!n,!endedWithSpace) c = SB.foldl' (\(!n,!endedWithSpace) c ->
case (endedWithSpace, isSpc c) of
(False, True) -> (n+1, True)
(_, cIsSpace) -> (n, cIsSpace)
) (n,endedWithSpace) c
countLines :: forall m. Monad m => CountingIteratee m
--countLines = countLines' 0
countLines = continue (step 0)
where
-- Proper line endings: \n, \r, \r\n
isEOL :: Word8 -> Bool
isEOL c = c == cr || c == lf
cr, lf :: Word8
cr = fromIntegral (ord '\r')
lf = fromIntegral (ord '\n')
countLines' :: Monad m => Counter -> CountingIteratee m
countLines' n | n `seq` False = undefined
countLines' n = do
_ <- EB.takeWhile (not . isEOL)
eol <- EB.head -- consume the first EOL character
case eol of
Just c | c == cr -> do
c <- EB.head
case c of
Just c | c /= lf -> yield () (Chunks [SB.singleton c])
_ -> return ()
countLines' (n + 1)
Just _ -> countLines' (n + 1)
Nothing -> return n
step :: Monad m => Counter -> Stream SB.ByteString -> CountingIteratee m
step n _ | n `seq` False = undefined
step n EOF = return n
step n (Chunks cs) = continue (step $ n + sum (map (sbCount (==lf)) cs))
sbCount :: (Word8 -> Bool) -> SB.ByteString -> Counter
sbCount p s = SB.foldl' (\a c -> if p c then a+1 else a) 0 s
count :: (a -> Bool) -> [a] -> Int
count p xs = foldl' countOne 0 xs
where
countOne a x | a `seq` False = undefined
countOne a x | p x = a+1
| otherwise = a
fanout :: forall a m r. Monad m => [Iteratee a m r] -> Iteratee a m [Either SomeException r]
fanout iters = do
steps <- mapM (lift . runIteratee) iters
continue (step steps)
where
step :: Monad m
=> [Step a m r] -- ^ list of unfinished iteratees or their results/errors
-> Stream a -- ^ stream piece to process
-> Iteratee a m [Either SomeException r]
step ss stream | countContinues ss == 0 = yield (map extractResult ss) stream
| otherwise = do
ss' <- mapM (stepContinue stream) ss
case stream of
EOF -> return (map extractResult ss')
_ -> continue (step ss')
countContinues :: [Step a m r] -> Int
countContinues = count isContinue
isContinue :: Step a m r -> Bool
isContinue (Continue _) = True
isContinue _ = False
stepContinue :: Monad m => Stream a -> Step a m r -> Iteratee a m (Step a m r)
stepContinue stream (Continue k) = lift $ runIteratee (k stream)
stepContinue _ s = return s
extractResult :: Step a m r -> Either SomeException r
extractResult (Yield r _) = Right r
extractResult (Error e) = Left e
extractResult _ = error "impossible happened"
main = do
hSetBinaryMode stdin True
[Right l,Right w,Right b] <- run_ $ EB.enumHandle (2^12) stdin ==<< fanout [countLines, countWords, countBytes']
printf "%4d %4d %4d\n" l w b
{-# LANGUAGE ScopedTypeVariables, ViewPatterns, OverloadedStrings, BangPatterns #-}
import Control.Applicative
import Control.Exception (SomeException)
import Control.Monad (when, mapM)
import Control.Monad.Trans (lift)
import Data.Char (ord)
import Data.Iteratee as I hiding (foldl')
import Data.Iteratee.IO
import qualified Data.Iteratee.ListLike as IL
import Data.List (foldl')
import Data.Word (Word8, Word64)
import qualified Data.ByteString as SB
import qualified Data.ByteString.Char8 as SB8
import qualified Data.ByteString.Lazy as LB; import Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy.Char8 as LB8
import System.IO (stdin, hSetBinaryMode)
import Text.Printf (printf)
type Counter = Word64 -- Int is ~3% faster than Word64
type CountingIteratee m = Iteratee SB.ByteString m Counter
countBytes :: forall m. Monad m => CountingIteratee m
countBytes = icont (step 0) Nothing
where
step :: Monad m => Counter -> Stream SB.ByteString -> CountingIteratee m
step n s | n `seq` False = undefined
step n s@(EOF _) = idone n s
step n (Chunk c) = icont (step $ n + fromIntegral (SB.length c)) Nothing
{-
countBytes' :: forall m. Monad m => CountingIteratee m
countBytes' = IL.foldl' (\(!c) bs -> c + fromIntegral (SB.length bs)) 0
-}
countWords :: forall m. Monad m => CountingIteratee m
--countWords = countWords' 0
countWords = icont (step 0 True) Nothing
--countWords = fst <$> IL.foldl' countInChunk (0, True)
where
isSpc :: Word8 -> Bool
isSpc c = c `SB.elem` " \t\r\n\f\v\xa0"
-- The break function returns strict bytestrings => it concatenates them while it gets the prefix.
-- This appears to be more efficient (~3.3x) than using lazy bytestrings on the word data, probably
-- because the words/space runs are generally short and it's cheaper to concatenate than to allocate
-- a sequence of lazy bytestring chunks (and then traverse them).
-- Still, Data.Enumerator (lazy bytestring version) is ~2.2x slower than lazy bytestring version of the
-- Data.Iteratee. (Is it a bug?)
countWords' :: Monad m => Counter -> CountingIteratee m
countWords' n | n `seq` False = undefined
countWords' n = do
_ <- I.break (not . isSpc)
cs <- I.break isSpc
if SB.null cs then
return n
else do
countWords' (n + 1) -- TODO is that properly optimized (do we get a stack overflow?)
step :: Monad m => Counter -> Bool -> Stream SB.ByteString -> CountingIteratee m
step n endedWithSpace _ | n `seq` endedWithSpace `seq` False = undefined
step n _ s@(EOF _) = idone n s
step n endedWithSpace (Chunk c) = icont (uncurry step $ countInChunk (n,endedWithSpace) c) Nothing
countInChunk :: (Counter,Bool) -> SB.ByteString -> (Counter, Bool)
countInChunk (n,endedWithSpace) c = SB.foldl' (\(!n,endedWithSpace) c ->
case (endedWithSpace, isSpc c) of
(False, True) -> (n+1, True)
(_, cIsSpace) -> (n, cIsSpace)
) (n,endedWithSpace) c
countLines :: forall m. Monad m => CountingIteratee m
--countLines = countLines' 0
countLines = icont (step 0) Nothing
where
-- Proper line endings: \n, \r, \r\n
isEOL :: Word8 -> Bool
isEOL c = c == cr || c == lf
cr, lf :: Word8
cr = fromIntegral (ord '\r')
lf = fromIntegral (ord '\n')
countLines' :: Monad m => Counter -> CountingIteratee m
countLines' n | n `seq` False = undefined
countLines' n = do
_ <- I.break isEOL
eol <- I.checkErr I.head -- consume the first EOL character
case eol of
Right c | c == cr -> do
c <- I.checkErr I.head
case c of
Right c | c /= lf -> idone () (Chunk (SB.singleton c))
_ -> return ()
countLines' (n + 1)
Right _ -> countLines' (n + 1)
_ -> return n
step :: Monad m => Counter -> Stream SB.ByteString -> CountingIteratee m
step n _ | n `seq` False = undefined
step n s@(EOF _) = idone n s
step n (Chunk c) = icont (step $ n + sbCount (==lf) c) Nothing
sbCount :: (Word8 -> Bool) -> SB.ByteString -> Counter
sbCount p s = SB.foldl' (\a c -> if p c then a+1 else a) 0 s
-- eneeCheckIfDone :: (Monad m, NullPoint elo) => ((Stream eli -> Iteratee eli m a) -> Iteratee elo m (Iteratee eli m a)) -> Enumeratee elo eli m a
-- liftI . step :: Monad m => (Stream eli -> Iteratee eli m a) -> Iteratee elo m (Iteratee eli m a)
-- liftI :: Monad m => (Stream elo -> Iteratee elo m (Iteratee eli m a)) -> Iteratee elo m (Iteratee eli m a)
-- step :: Monad m => Stream elo -> Iteratee elo m (Iteratee eli m a)
--
count :: (a -> Bool) -> [a] -> Int
count p xs = foldl' countOne 0 xs
where
countOne a x | a `seq` False = undefined
countOne a x | p x = a+1
| otherwise = a
data FanoutState a m r = FanoutIter { fanoutIter :: Iteratee a m r }
| FanoutErr { fanoutErr :: SomeException }
| FanoutRes { fanoutRes :: r }
fanout :: forall a m r. (Show a, Nullable a, Monad m) => [Iteratee a m r] -> Iteratee a m [Either SomeException r]
fanout iters = icont (step (map FanoutIter iters)) Nothing
where
step :: Monad m => [FanoutState a m r] -> Stream a -> Iteratee a m [Either SomeException r]
step iters stream | countIters iters == 0 = idone (map extractResult iters) stream
| otherwise = do
iters' <- mapM (stepOne stream) iters
case stream of
(EOF _) -> step iters' stream
_ -> icont (step iters') Nothing
extractResult :: FanoutState a m r -> Either SomeException r
extractResult (FanoutRes r) = Right r
extractResult (FanoutErr e) = Left e
extractResult _ = error "impossible happened"
countIters :: [FanoutState a m r] -> Int
countIters = count isFanoutIter
isFanoutIter :: FanoutState a m r -> Bool
isFanoutIter (FanoutIter _) = True
isFanoutIter _ = False
stepOne :: Monad m => Stream a -> FanoutState a m r -> Iteratee a m (FanoutState a m r)
stepOne _ f@(FanoutRes _) = return f
stepOne _ f@(FanoutErr _) = return f
stepOne stream (FanoutIter iter) = do
iter' <- lift $ runIter iter onDone onCont
case (stream, iter') of
(EOF _, FanoutIter iter') -> lift $ runIter iter' onDone (error "divergent iteratee")
_ -> return iter'
where
onDone :: Monad m => r -> Stream a -> m (FanoutState a m r)
onDone r _ = return $ FanoutRes r
onCont :: Monad m => (Stream a -> Iteratee a m r) -> Maybe SomeException -> m (FanoutState a m r)
onCont _ (Just err) = undefined --return $ FanoutErr err
onCont k Nothing = return $ FanoutIter (k stream)
main = do
hSetBinaryMode stdin True
[Right l,Right w,Right b] <- run $ joinIM $ enumHandle (2^12) stdin $ fanout [countLines, countWords, countBytes]
printf "%4d %4d %4d\n" l w b
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment