Skip to content

Instantly share code, notes, and snippets.

@ivant
Last active December 31, 2015 14:48
Show Gist options
  • Save ivant/8002288 to your computer and use it in GitHub Desktop.
Save ivant/8002288 to your computer and use it in GitHub Desktop.
Build a sorted word frequency list from a file, trimmed to a given quantile.
-- Build a sorted word frequency list from a file, trimmed to a given quantile.
--
-- Usage: WordStats <book.txt> <quantile>
--
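-- `quantile` is a number between 0 and 1. Words are listed from most to least
-- frequent, and the list stops before the first word whose cumulative share
-- of all word occurrences would exceed `quantile`.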
--
-- Example:
-- ./WordStats "Don Quijote.txt" 0.85 > "Don Quijote.words.85"
import Control.Applicative
import Control.Monad (forM_)
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import Data.Char (isAlpha)
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as M
import Data.List (sortBy)
import Data.Ord (comparing, Down(..))
import Data.Ratio
import qualified Data.Text as T
import qualified Data.Text.Encoding as E
import System.Environment (getArgs)
import System.Exit (exitFailure)
import System.IO (hPutStrLn, stderr)
import Text.Printf
import Text.Read (readMaybe)
-- | Split a text into lower-cased, letters-only words.
--
-- The input is split on whitespace with 'T.words'. Each token is lower-cased
-- and then stripped of every character for which 'isAlpha' is false. Tokens
-- that end up empty (for example numbers or bare punctuation) are dropped.
getWords :: T.Text -> [T.Text]
getWords = filter (not . T.null) . map (T.filter isAlpha . T.toLower) . T.words
-- | Count how many times each word occurs in the given list.
--
-- Every word is paired with a count of 1 and the pairs are merged with '(+)',
-- so each distinct word maps to its number of occurrences.
wordMap :: [T.Text] -> HashMap T.Text Int
wordMap ws = M.fromListWith (+) [(w, 1) | w <- ws]
-- | Program entry point.
--
-- Expects at least two command-line arguments: the path of a UTF-8 text file
-- and a quantile between 0 and 1 (extra arguments are ignored). Prints one
-- line per selected word in the form @<count> <word>@, most frequent first.
--
-- Missing arguments, an unparsable quantile, or a quantile outside @[0, 1]@
-- produce a usage message on stderr and a failure exit code instead of a
-- runtime crash from a partial function.
main :: IO ()
main = do
  args <- getArgs
  case args of
    (file : quantileArg : _)
      | Just quantile <- readMaybe quantileArg
      , quantile >= 0 && quantile <= (1 :: Double) -> printWordStats file quantile
    _ -> do
      hPutStrLn stderr "Usage: WordStats <book.txt> <quantile>   (quantile is a number between 0 and 1)"
      exitFailure

-- | Read @file@, count its words, and print the most frequent ones.
--
-- Words are sorted by descending count. The output stops before the first
-- word whose cumulative share of all word occurrences exceeds @quantile@.
-- A file that is not valid UTF-8 is reported on stderr and the program exits
-- with a failure code.
printWordStats :: FilePath -> Double -> IO ()
printWordStats file quantile = do
  bytes <- B.readFile file
  -- decodeUtf8' reports invalid input as a value instead of throwing from
  -- pure code.
  case E.decodeUtf8' bytes of
    Left err -> do
      hPutStrLn stderr (file ++ ": not valid UTF-8: " ++ show err)
      exitFailure
    Right text -> do
      let freqList = sortBy (comparing (Down . snd)) (M.toList (wordMap (getWords text))) :: [(T.Text, Int)]
          counts = map snd freqList
          totalWords = fromIntegral (sum counts) :: Double
          -- Running total of occurrences up to and including each word.
          runningTotals = scanl1 (+) counts
          -- Only evaluated for a non-empty list, so totalWords is positive here.
          withinQuantile (_, running) = fromIntegral running / totalWords <= quantile
          selected = takeWhile withinQuantile (zip freqList runningTotals)
      forM_ selected $ \((w, c), _) -> printf "%d %s\n" c (T.unpack w)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment