Skip to content

Instantly share code, notes, and snippets.

@unhammer

unhammer/SWC.hs Secret

Created May 27, 2022 10:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save unhammer/4fc1e7c72bf5251663eb7455bea6b897 to your computer and use it in GitHub Desktop.
Save unhammer/4fc1e7c72bf5251663eb7455bea6b897 to your computer and use it in GitHub Desktop.
streamly wc, 2x faster than getcontents→list→hashmap→list, 2x slower than awk
-- Based on
-- https://github.com/composewell/streamly-examples/blob/master/examples/WordFrequency.hs
-- but I don't seem to have classifyMutWith so explicitly making a HashMap of IORefs
import Data.Char (isSpace)
import Data.Foldable (traverse_)
import Data.Function ((&))
import System.Environment (getArgs)
import Data.IORef (newIORef, readIORef, modifyIORef')
import qualified Data.Char as Char
import qualified Data.HashMap.Strict as Map
import qualified Data.List as List
import qualified Data.Ord as Ord
import qualified Streamly.Data.Fold as Fold
import qualified Streamly.Internal.FileSystem.File as File (toBytes)
import qualified Streamly.Prelude as Stream
import qualified Streamly.Unicode.Stream as Unicode
main :: IO ()
main = do
inFile <- fmap head getArgs
-- Write the stream to a hashmap consisting of word counts
mp <-
let
alter Nothing = Just <$> newIORef (1 :: Int)
alter (Just ref) = modifyIORef' ref (+ 1) >> return (Just ref)
in File.toBytes inFile -- SerialT IO Word8
& Unicode.decodeUtf8 -- SerialT IO Char
& Stream.map toLower -- SerialT IO Char
& Stream.wordsBy isSpace Fold.toList -- SerialT IO String
& Stream.filter (all isAlpha) -- SerialT IO String
& Stream.foldlM' (flip (Map.alterF alter)) (return Map.empty) -- IO (Map String (IORef Int))
-- Print the top hashmap entries
counts <-
let readRef (w, ref) = do
cnt <- readIORef ref
return (w, cnt)
in Map.toList mp
& mapM readRef
traverse_ print $ List.sortOn (Ord.Down . snd) counts
& List.take 10
-- These make it slightly faster than with Data.Char's versions:
{-# INLINE toLower #-}
toLower :: Char -> Char
toLower c
| uc >= 0x61 && uc <= 0x7a = c
| otherwise = Char.toLower c
where
uc = fromIntegral (Char.ord c) :: Word
{-# INLINE isAlpha #-}
isAlpha :: Char -> Bool
isAlpha c
| uc >= 0x61 && uc <= 0x7a = True
| otherwise = Char.isAlpha c
where
uc = fromIntegral (Char.ord c) :: Word
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment