Skip to content

Instantly share code, notes, and snippets.

@michaelt
Created February 20, 2014 20:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michaelt/9122379 to your computer and use it in GitHub Desktop.
Save michaelt/9122379 to your computer and use it in GitHub Desktop.
parser zoom trouble
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns#-}
{-# LANGUAGE RankNTypes #-}
import Blaze.ByteString.Builder (Builder, fromByteString, toByteString)
import Control.Exception (Exception)
import Control.Monad.Trans.Class (lift)
import Data.ByteString (ByteString)
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Encoding as TEE
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TLE
import Pipes
import Pipes.Parse
import qualified Pipes.Prelude as PP
import qualified Pipes.ByteString as Bytes
import qualified Pipes.Text as Txt
import Pipes.Text.Encoding (utf8)
import Control.Lens
import Control.Lens.Internal.Zoom
import Control.Monad
import qualified System.IO as IO
import Control.Monad.Trans.Maybe
import Control.Monad.State.Class
main :: IO ()
main = do -- S.writeFile fp $ contents 10000 -- 10000 cannot be handled fileParser0 and 1
-- parse_file fileParser0 -- pathological
-- parse_file fileParser1 -- programs
parse_file fileParser2 -- good program
where
parse_file parser = IO.withBinaryFile fp IO.ReadMode $ \h ->
do p' <- runEffect $ parseWith parser ( Bytes.fromHandle h ) >-> PP.print
runEffect $ p' >-> PP.print
parseWith parser = loop where
loop p = do (m,p') <- lift (runStateT (runMaybeT parser) p)
case m of Nothing -> return p'
Just file -> do yield file
loop p'
fp = "encoded.fileformat"
contents n = (toByteString . mconcat . replicate n . encodeFiles) input
<> S.pack (replicate 10 250)
-- infix lens nonsense
infixr 1 ~~>
(~~>) :: Zoom m n s t
=> ((s -> Zoomed n c s) -> t -> Zoomed n c t)
-> m c -> n c
(~~>) = zoom
{-# INLINE (~~>) #-}
fileParser0, fileParser1, fileParser2 :: Monad m => MaybeT (StateT (Producer ByteString m x) m) File
fileParser0 = do (name, len) <- zoom utf8 parseText
contents <- zoom (Bytes.splitAt len) (lift drawAll)
return (File name (S.concat contents))
where
-- this parser aggregates all Text parsing into one preliminary parser
-- which is then applied with `zoom utf8`
-- we cannot tell in advance how long, e.g. the file name will be
parseText :: Monad m => MaybeT (StateT (Producer Text m x) m) (Text, Int)
parseText = do nameLength <- parseNumber
names <- zoom (Txt.splitAt nameLength) $ (lift drawAll)
contentLength <- parseNumber
return $! (T.concat names, contentLength)
-- here we disaggregate the little Text parsers but still apply them with `zoom utf8`
-- this makes no difference
fileParser1 = do nameLength <- utf8 ~~> parseNumber
names <- utf8 ~~> Txt.splitAt nameLength ~~> lift drawAll
contentLength <- utf8 ~~> parseNumber
contents <- Bytes.splitAt contentLength ~~> lift drawAll
return (File (T.concat names) (S.concat contents))
-- this is the good program; be reflecting on the fact that file names
-- should not be a 1000 bytes long, and binary files longer than e.g. 10 ^ 10
-- we can restrict the length of the byte stream to which we apply `zoom utf8`
fileParser2 = do nameLength <- Bytes.splitAt 3 ~~> utf8 ~~> parseNumber
names <- Bytes.splitAt nameLength ~~> utf8 ~~> lift drawAll
len <- Bytes.splitAt 10 ~~> utf8 ~~> parseNumber
contents <- Bytes.splitAt len ~~> lift drawAll
return (File (T.concat names) (S.concat contents))
parseNumber :: Monad m => MaybeT (StateT (Producer Text m x) m) Int
parseNumber = loop 0 where
loop !n = do c <- MaybeT Txt.drawChar
case c of ':' -> return n
_ -> do guard ('0' <= c && c <= '9')
loop $! n * 10 + (fromEnum c - fromEnum '0')
-- --- Michael S's `File` type and its binary encoding, etc.
data File = File
{ fileName :: !Text
, fileContents :: !ByteString
}
deriving Show
encodeFile :: File -> Builder
encodeFile (File name contents) =
tellLength (S.length bytesname) <>
fromByteString bytesname <>
tellLength (S.length contents) <>
fromByteString contents
where
tellLength i = fromByteString $ TEE.encodeUtf8 (T.pack (shows i ":"))
bytesname = TEE.encodeUtf8 name
encodeFiles :: [File] -> Builder
encodeFiles = mconcat . map encodeFile
input :: [File]
input =
[ File "utf8.txt" $ TEE.encodeUtf8 "This file is in UTF-8"
, File "utf16.txt" $ TEE.encodeUtf16LE "This file is in UTF-16"
, File "binary.dat" "we'll pretend to be binary"
]
---
-- This desperate scheme actually has some efficacy, if used before `utf8` in a zoom
-- but not much
chunk :: Monad m => Int -> Lens' (Producer ByteString m r) (Producer ByteString m r)
chunk n = lens (chunkyN n) (\_ b -> b) where
chunkyN :: Monad m => Int -> Producer ByteString m r -> Producer ByteString m r
chunkyN n = prod_loop where
prod_loop p = do mbs <- lift $ next p
case mbs of Left r -> return r
Right (bs, p') -> do bs_loop bs
prod_loop p'
bs_loop bs = unless (S.null bs) $ do yield fore
unless (S.null aft) (bs_loop aft)
where (fore, aft) = S.splitAt n bs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment