Skip to content

Instantly share code, notes, and snippets.

@Codas
Created May 1, 2015 10:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Codas/894694eea247aaacf35f to your computer and use it in GitHub Desktop.
Save Codas/894694eea247aaacf35f to your computer and use it in GitHub Desktop.
sum csv
On an 850 MB CSV file:
time sum-haskell big-data.csv
1500352.4655644759,1500860.0948624783,...
10.61 real 10.24 user 0.35 sys
time ./sum-ocaml big-data.csv
1500352.46556,1500860.09486,1499634....
17.71 real 17.30 user 0.33 sys
time python2.7 sum-python.py big-data.csv
1500352.46556,1500860.09486,...
38.86 real 38.36 user 0.40 sys
time pypy sum-python.py big-data.csv
1500352.46556,1500860.09486,...
28.52 real 27.69 user 0.48 sys
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Data.Maybe (fromJust)
import qualified Data.ByteString.Lex.Double as BS
import Control.Monad (foldM_)
import Conduit hiding (sourceFile)
import Data.Conduit.Binary
import Data.Vector.Unboxed (Vector)
import qualified Data.Vector.Unboxed as V
import qualified Data.Vector.Unboxed.Mutable as MV
import System.Environment
import Prelude hiding (lines)
-- | Render the column totals as a single comma-separated line.
--
-- Each value is formatted with 'show'; an empty vector renders as the
-- empty string, and no leading or trailing comma is produced.
display :: Vector Double -> String
display ds =
  case V.toList ds of
    []            -> ""
    (first : rest) -> show first ++ concatMap (\d -> ',' : show d) rest
-- | Fold step: add the comma-separated numbers on one line to the running
-- per-column totals.
--
-- * @v@ holds the totals accumulated so far; an empty vector means no
--   numeric line has been seen yet, in which case the parsed line itself
--   becomes the totals.
-- * @bs@ is one line of input, split on the comma byte (44).
--
-- Rows do not need to have the same number of fields: a column that is
-- missing from either side contributes 0, so a short or blank line keeps the
-- existing totals for the columns it lacks, and a longer line widens the
-- result instead of indexing out of bounds.
--
-- Calls 'error', naming the offending field, when a field cannot be parsed
-- as a number.
sumLines :: Vector Double -> ByteString -> Vector Double
sumLines v bs
  | V.null v  = fields
  | otherwise = V.generate width total
  where
    fields = V.fromList . map parseField $ BS.split 44 {- ',' -} bs

    -- The result is as wide as the wider of the two operands.
    width = max (V.length v) (V.length fields)

    total i = column v i + column fields i

    -- Out-of-range columns count as zero rather than failing.
    column xs i
      | i < V.length xs = xs V.! i
      | otherwise       = 0

    -- NOTE(review): the "unsafe" reader is kept from the original for speed;
    -- confirm it is safe to use on slices produced by 'BS.split'.
    parseField field =
      case BS.unsafeReadDouble field of
        Just (d, _) -> d
        Nothing     ->
          error ("sumLines: cannot parse field as a number: " ++ show field)
-- | Program entry point.
--
-- Takes the path of the input file from the first command-line argument
-- (any further arguments are ignored), feeds the file through 'lines' into a
-- fold with 'sumLines', and prints the resulting totals via 'display'.
-- Aborts with an error when no argument is supplied.
main :: IO ()
main = getArgs >>= go
  where
    go []             = error "Exactly one filename must be given."
    go (filePath : _) =
      runResourceT (sourceFile filePath $= lines $$ foldlC sumLines mempty)
        >>= putStrLn . display
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment