Skip to content

Instantly share code, notes, and snippets.

@ndmitchell
Created July 4, 2016 21:31
Show Gist options
  • Save ndmitchell/6828df5eeca3776b32a0420c73dc95ed to your computer and use it in GitHub Desktop.
Save ndmitchell/6828df5eeca3776b32a0420c73dc95ed to your computer and use it in GitHub Desktop.
{-# LANGUAGE Rank2Types #-}
import qualified Core.IO as F
import qualified Core.String as F
import qualified Core.Array as F
import qualified Core.VFS as F
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString as BS
import Codec.Compression.GZip as GZip
import Codec.Archive.Tar as Tar
import Control.Monad
import Data.Conduit
import qualified Data.Conduit.List as C
import qualified Core.Collection as F
import qualified Core as F
main = do
let file = "C:/Users/ndmit_000/AppData/Roaming/hoogle/input-haskell-hoogle.tar.gz"
xs <- tarballReadFiles file
forM_ xs $ \(file, content) -> do
print file
i <- runConduit $ unpackLBS content =$= linesC =$= count
print (file, i)
unpackLBS :: Monad m => LBS.ByteString -> Producer m F.ByteArray
unpackLBS = C.sourceList . map f . LBS.toChunks
where f = F.fromList . BS.unpack
linesC :: Monad m => Conduit F.ByteArray m F.String
linesC = loop []
where
loop acc = await >>= maybe (finish acc) (go acc)
finish acc = case F.fromBytes F.UTF8 final of
_ | F.null final -> return ()
(s, z) | not $ F.null z -> error "Trailing garbage at the end"
| otherwise -> yield s
where final = mconcat $ reverse acc
go acc more = case f_uncons second of
Just (_, second') ->
let (s, end) = F.fromBytes F.UTF8 $ mconcat $ reverse $ first:acc
in yield s >> go [end] second'
Nothing -> loop $ more:acc
where (first, second) = F.break (== fromIntegral (fromEnum '\n')) more
f_uncons :: F.ByteArray -> Maybe (F.Word8, F.ByteArray)
f_uncons x = case x F.! 0 of
Nothing -> Nothing
Just c -> Just (c, F.drop 1 x)
count :: Monad m => Consumer a m Int
count = C.map (const 1) =$= C.fold (+) 0
tarballReadFiles :: FilePath -> IO [(FilePath, LBS.ByteString)]
tarballReadFiles file = f . Tar.read . GZip.decompress <$> LBS.readFile file
where
f (Next e rest) | NormalFile body _ <- entryContent e = (entryPath e, body) : f rest
f (Next _ rest) = f rest
f Done = []
f (Fail e) = error $ "tarballReadFiles on " ++ file ++ ", " ++ show e
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment