Skip to content

@Tener /wordfreq-bs.hs
Created

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Fixed wordfreq solutions to use text/bytestring
{-# LANGUAGE BangPatterns #-}
module Main where
import Prelude
import Data.ByteString as B
import Data.ByteString.Char8 as BC8
import System.Environment (getArgs)
import Control.Arrow
import Data.List (sortBy)
import Data.Char (isLetter)
import qualified Data.HashMap.Strict as HM
import Data.Ord (comparing)
-- | ASCII-only letter test on a numeric character code.
--
-- Answers 'True' exactly for the codes 65..90 (@A@-@Z@) and 97..122
-- (@a@-@z@), mirroring the a-zA-Z ranges used by the D version. Unlike
-- 'Data.Char.isLetter' it does not recognise any Unicode letters.
isLetterFast :: (Ord a, Num a) => a -> Bool
isLetterFast code = between 65 90 || between 97 122
  where
    -- Inclusive range check against the code under test.
    between lo hi = lo <= code && code <= hi
-- | ASCII lower-casing on a numeric character code (the word8 counterpart of
-- @toLower@): codes 65..90 (@A@-@Z@) are shifted up by 32 onto 97..122
-- (@a@-@z@); every other code is returned unchanged.
toLowerWord8 :: (Ord a, Num a) => a -> a
toLowerWord8 code =
  if code >= 65 && code <= 90
    then code + 32
    else code
-- | Keep codes that 'isLetterFast' accepts; replace every other code with 32,
-- the ASCII space, so that non-letters act as word separators.
replaceNonLetter :: (Ord a, Num a) => a -> a
replaceNonLetter code = if isLetterFast code then code else 32
-- | Build the word-frequency report for @text@.
--
-- Every byte is passed through 'toLowerWord8' and then 'replaceNonLetter', so
-- the remaining words are runs of lower-case ASCII letters separated by
-- spaces. Each occurrence is tallied in a strict hash map, the entries are
-- ordered by descending count, and the first @n@ of them are rendered one per
-- line as @word count@.
--
-- 'sortBy' is stable, so words with equal counts keep the relative order in
-- which 'HM.toList' happens to yield them.
createReport :: Int -> ByteString -> String
createReport n text =
  Prelude.unlines
    . Prelude.map renderEntry
    . Prelude.take n
    . sortBy (flip $ comparing snd)
    . HM.toList
    . HM.fromListWith (+)
    -- The count is pinned to Int: left unannotated, the literal would be
    -- defaulted to Integer, which is slower and triggers -Wtype-defaults.
    . Prelude.map (\w -> (w, 1 :: Int))
    . BC8.words
    . B.map (replaceNonLetter . toLowerWord8)
    $ text
  where
    -- One report line, without the trailing newline added by 'Prelude.unlines'.
    renderEntry (w, freq) = BC8.unpack w ++ " " ++ show freq
-- | Entry point. Expects exactly two command-line arguments, a file name and
-- an integer @N@, and prints the report for the @N@ most frequent words.
--
-- The arguments are validated before the file is read. A wrong argument count
-- or a non-integer @N@ raises a descriptive 'userError' instead of a
-- pattern-match failure or a late @Prelude.read: no parse@.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [fileName, nstr]
      -- Same acceptance rule as 'read': one parse, followed only by whitespace.
      | [(n, rest)] <- reads nstr
      , [("", "")] <- lex rest -> do
          text <- BC8.readFile fileName
          Prelude.putStr $ createReport n text
    _ -> ioError (userError "expected arguments: FILE N (N must be an integer)")
{-# LANGUAGE BangPatterns #-}
module Main where
import Prelude
import Data.Text as T
import Data.Text.IO as T
import System.Environment (getArgs)
import Control.Arrow
import Data.List (sortBy)
import Data.Char (isLetter)
import qualified Data.HashMap.Strict as HM
import Data.Ord (comparing)
-- | Characters accepted by 'isLetter' pass through unchanged; any other
-- character becomes a space so that it acts as a word separator.
replaceNonLetter :: Char -> Char
replaceNonLetter ch = if isLetter ch then ch else ' '
-- | Build the word-frequency report for @text@.
--
-- The text is lower-cased with 'T.toLower', every character rejected by
-- 'replaceNonLetter' is turned into a space, and the result is split into
-- words. Each occurrence is tallied in a strict hash map, the entries are
-- ordered by descending count, and the first @n@ of them are rendered one per
-- line as @word count@.
--
-- 'sortBy' is stable, so words with equal counts keep the relative order in
-- which 'HM.toList' happens to yield them.
createReport :: Int -> Text -> String
createReport n text =
  Prelude.unlines
    . Prelude.map renderEntry
    . Prelude.take n
    . sortBy (flip $ comparing snd)
    . HM.toList
    . HM.fromListWith (+)
    -- The count is pinned to Int: left unannotated, the literal would be
    -- defaulted to Integer, which is slower and triggers -Wtype-defaults.
    . Prelude.map (\w -> (w, 1 :: Int))
    . T.words
    . T.map replaceNonLetter
    . T.toLower
    $ text
  where
    -- 'Prelude.show' is qualified because Data.Text is imported unqualified
    -- here and may export a 'show' of its own in newer releases of text
    -- (TODO confirm the exact version).
    renderEntry (w, freq) = T.unpack w ++ " " ++ Prelude.show freq
-- | Entry point. Expects exactly two command-line arguments, a file name and
-- an integer @N@, and prints the report for the @N@ most frequent words.
--
-- The arguments are validated before the file is read. A wrong argument count
-- or a non-integer @N@ raises a descriptive 'userError' instead of a
-- pattern-match failure or a late @Prelude.read: no parse@.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [fileName, nstr]
      -- Same acceptance rule as 'read': one parse, followed only by whitespace.
      | [(n, rest)] <- reads nstr
      , [("", "")] <- lex rest -> do
          text <- T.readFile fileName
          Prelude.putStr $ createReport n text
    _ -> ioError (userError "expected arguments: FILE N (N must be an integer)")
@michaelt

For what it's worth, I finished the ByteStringing of your ByteString module; it seems a bit more than twice as fast that way. https://gist.github.com/anonymous/5850805 I made an attempt to avoid the Data.Char functions for Text, which seemed to give a 25-30% speedup, at the cost of silly boilerplate.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.