public
Created

Fixed wordfreq solutions to use text/bytestring

  • Download Gist
wordfreq-bs.hs
Haskell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
{-# LANGUAGE BangPatterns #-}
module Main where
 
import Prelude
import Data.ByteString as B
import Data.ByteString.Char8 as BC8
import System.Environment (getArgs)
import Control.Arrow
 
import Data.List (sortBy)
import Data.Char (isLetter)
import qualified Data.HashMap.Strict as HM
import Data.Ord (comparing)
 
-- | ASCII-only letter test: 'True' exactly for the values 65..90 (@A@-@Z@)
-- and 97..122 (@a@-@z@), 'False' for everything else.
--
-- Equivalent to the a-zA-Z ranges in D; unlike 'Data.Char.isLetter' it does
-- not handle Unicode data.
isLetterFast :: (Ord a, Num a) => a -> Bool
isLetterFast c = isUpperAscii || isLowerAscii
  where
    isUpperAscii = c >= 65 && c <= 90
    isLowerAscii = c >= 97 && c <= 122
 
-- | ASCII lower-casing for byte values (the word8 counterpart of @toLower@):
-- values 65..90 (@A@-@Z@) are shifted up by 32 to 97..122 (@a@-@z@); any
-- other value is returned unchanged.
toLowerWord8 :: (Ord a, Num a) => a -> a
toLowerWord8 c =
  if c >= 65 && c <= 90
    then c + 32
    else c
 
-- | Keep values accepted by 'isLetterFast' and replace every other value
-- with 32 (the ASCII space), so that non-letters act as word separators.
replaceNonLetter :: (Ord a, Num a) => a -> a
replaceNonLetter c =
  if isLetterFast c
    then c
    else 32 -- ' '
 
-- | Build a word-frequency report for @text@, limited to the first @n@
-- entries.
--
-- Each byte is lower-cased with 'toLowerWord8' and then passed through
-- 'replaceNonLetter', so anything that is not an ASCII letter becomes a
-- space. The result is split with 'BC8.words', occurrences are tallied in a
-- strict hash map, and the entries are sorted by descending count. 'sortBy'
-- is stable, so words with equal counts stay in the order 'HM.toList'
-- produced them.
--
-- The result has one line per word, formatted as @word count@. A
-- non-positive @n@ yields the empty string (behaviour of 'Prelude.take').
createReport :: Int -> ByteString -> String
createReport n text =
  Prelude.unlines
    . Prelude.map formatEntry
    . Prelude.take n
    . sortBy (flip $ comparing snd)
    . HM.toList
    . HM.fromListWith (\ !old !new -> old + new)
    -- Pin the counter to Int: an unannotated literal would fall back to
    -- Integer through type defaulting and make every increment pay for
    -- arbitrary-precision arithmetic.
    . Prelude.map (\w -> (w, 1 :: Int))
    . BC8.words
    $ B.map (replaceNonLetter . toLowerWord8) text
  where
    formatEntry (w, occurrences) = BC8.unpack w ++ " " ++ show occurrences
 
 
-- | Entry point. Expects exactly two command-line arguments, @FILE@ and @N@,
-- reads @FILE@ as a strict 'ByteString' and prints the report produced by
-- 'createReport' for the @N@ most frequent words.
--
-- A wrong number of arguments or a non-integer @N@ raises a 'userError'
-- describing the expected arguments, instead of an opaque pattern-match
-- failure or @Prelude.read: no parse@.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [fileName, nstr] | Just n <- parseCount nstr -> do
      text <- BC8.readFile fileName
      Prelude.putStr $ createReport n text
    _ -> ioError $ userError "expected arguments: FILE N (N must be an integer)"
  where
    -- Total counterpart of 'read': accepts exactly the inputs 'read' accepts
    -- (including surrounding whitespace) but returns 'Nothing' instead of
    -- throwing on anything else.
    parseCount :: String -> Maybe Int
    parseCount s =
      case [v | (v, rest) <- reads s, ("", "") <- lex rest] of
        [v] -> Just v
        _ -> Nothing
wordfreq-text.hs
Haskell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
{-# LANGUAGE BangPatterns #-}
module Main where
 
import Prelude
import Data.Text as T
import Data.Text.IO as T
import System.Environment (getArgs)
import Control.Arrow
 
import Data.List (sortBy)
import Data.Char (isLetter)
import qualified Data.HashMap.Strict as HM
import Data.Ord (comparing)
 
-- | Keep characters accepted by 'isLetter' and replace every other character
-- with a space, so that non-letters act as word separators.
replaceNonLetter :: Char -> Char
replaceNonLetter c =
  if isLetter c
    then c
    else ' '
 
-- | Build a word-frequency report for @text@, limited to the first @n@
-- entries.
--
-- The text is lower-cased with 'T.toLower', every character rejected by
-- 'replaceNonLetter' becomes a space, and the result is split with
-- 'T.words'. Occurrences are tallied in a strict hash map and the entries
-- are sorted by descending count. 'sortBy' is stable, so words with equal
-- counts stay in the order 'HM.toList' produced them.
--
-- The result has one line per word, formatted as @word count@. A
-- non-positive @n@ yields the empty string (behaviour of 'Prelude.take').
createReport :: Int -> Text -> String
createReport n text =
  Prelude.unlines
    . Prelude.map formatEntry
    . Prelude.take n
    . sortBy (flip $ comparing snd)
    . HM.toList
    . HM.fromListWith (\ !old !new -> old + new)
    -- Pin the counter to Int: an unannotated literal would fall back to
    -- Integer through type defaulting and make every increment pay for
    -- arbitrary-precision arithmetic.
    . Prelude.map (\w -> (w, 1 :: Int))
    . T.words
    . T.map replaceNonLetter
    $ T.toLower text
  where
    -- 'show' is qualified on purpose: Data.Text is imported unqualified, and
    -- recent text releases export their own 'show', which would make the
    -- bare name ambiguous.
    formatEntry (w, occurrences) =
      T.unpack w ++ " " ++ Prelude.show occurrences
 
-- | Entry point. Expects exactly two command-line arguments, @FILE@ and @N@,
-- reads @FILE@ with 'T.readFile' and prints the report produced by
-- 'createReport' for the @N@ most frequent words.
--
-- A wrong number of arguments or a non-integer @N@ raises a 'userError'
-- describing the expected arguments, instead of an opaque pattern-match
-- failure or @Prelude.read: no parse@.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [fileName, nstr] | Just n <- parseCount nstr -> do
      text <- T.readFile fileName
      Prelude.putStr $ createReport n text
    _ -> ioError $ userError "expected arguments: FILE N (N must be an integer)"
  where
    -- Total counterpart of 'read': accepts exactly the inputs 'read' accepts
    -- (including surrounding whitespace) but returns 'Nothing' instead of
    -- throwing on anything else.
    parseCount :: String -> Maybe Int
    parseCount s =
      case [v | (v, rest) <- reads s, ("", "") <- lex rest] of
        [v] -> Just v
        _ -> Nothing

For what it's worth, I finished the ByteStringing of your ByteString module; it seems to be a bit more than twice as fast that way: https://gist.github.com/anonymous/5850805. I also made an attempt to avoid the Data.Char functions for Text, which seemed to give a 25-30% speedup, at the cost of silly boilerplate.

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.