-
-
Save unhammer/4fc1e7c72bf5251663eb7455bea6b897 to your computer and use it in GitHub Desktop.
streamly wc, 2x faster than getcontents→list→hashmap→list, 2x slower than awk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Based on | |
-- https://github.com/composewell/streamly-examples/blob/master/examples/WordFrequency.hs | |
-- but I don't seem to have classifyMutWith so explicitly making a HashMap of IORefs | |
import Data.Char (isSpace) | |
import Data.Foldable (traverse_) | |
import Data.Function ((&)) | |
import System.Environment (getArgs) | |
import Data.IORef (newIORef, readIORef, modifyIORef') | |
import qualified Data.Char as Char | |
import qualified Data.HashMap.Strict as Map | |
import qualified Data.List as List | |
import qualified Data.Ord as Ord | |
import qualified Streamly.Data.Fold as Fold | |
import qualified Streamly.Internal.FileSystem.File as File (toBytes) | |
import qualified Streamly.Prelude as Stream | |
import qualified Streamly.Unicode.Stream as Unicode | |
main :: IO () | |
main = do | |
inFile <- fmap head getArgs | |
-- Write the stream to a hashmap consisting of word counts | |
mp <- | |
let | |
alter Nothing = Just <$> newIORef (1 :: Int) | |
alter (Just ref) = modifyIORef' ref (+ 1) >> return (Just ref) | |
in File.toBytes inFile -- SerialT IO Word8 | |
& Unicode.decodeUtf8 -- SerialT IO Char | |
& Stream.map toLower -- SerialT IO Char | |
& Stream.wordsBy isSpace Fold.toList -- SerialT IO String | |
& Stream.filter (all isAlpha) -- SerialT IO String | |
& Stream.foldlM' (flip (Map.alterF alter)) (return Map.empty) -- IO (Map String (IORef Int)) | |
-- Print the top hashmap entries | |
counts <- | |
let readRef (w, ref) = do | |
cnt <- readIORef ref | |
return (w, cnt) | |
in Map.toList mp | |
& mapM readRef | |
traverse_ print $ List.sortOn (Ord.Down . snd) counts | |
& List.take 10 | |
-- These make it slightly faster than with Data.Char's versions: | |
{-# INLINE toLower #-} | |
toLower :: Char -> Char | |
toLower c | |
| uc >= 0x61 && uc <= 0x7a = c | |
| otherwise = Char.toLower c | |
where | |
uc = fromIntegral (Char.ord c) :: Word | |
{-# INLINE isAlpha #-} | |
isAlpha :: Char -> Bool | |
isAlpha c | |
| uc >= 0x61 && uc <= 0x7a = True | |
| otherwise = Char.isAlpha c | |
where | |
uc = fromIntegral (Char.ord c) :: Word | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment