Skip to content

Instantly share code, notes, and snippets.

@rampion
Created December 7, 2012 23:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rampion/4237357 to your computer and use it in GitHub Desktop.
Save rampion/4237357 to your computer and use it in GitHub Desktop.
Details for SO question 13735640
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
module Main where
import Control.Applicative ((<$>), (<*>), (<|>))
import Control.Monad (forM_, (>>), return)
import Control.Monad.Trans (lift)
import Data.ByteString.Lazy.Char8 (readFile, count)
import Data.HashMap (empty, insertWith, assocs, Map, size)
import Data.Hashable (Hashable)
import Data.List (foldl')
import Data.Text (unpack, pack, Text)
import Prelude (show, Show, ($), (.), (++), div, (*), (-), Char, Int, replicate, Eq, Ord, concat, read, length, map, const, flip, Either(..), fromIntegral)
import System.Environment (getArgs)
import System.Exit (exitFailure)
import System.IO (putStrLn, hPutStr, hPutStrLn, stderr, IO)
import Text.Parsec (char, eof, many, many1, noneOf, digit, string, ParsecT, Stream, runParserT)
import Text.Parsec.Prim (modifyState, getState, getPosition)
import Text.Parsec.Pos (sourceLine)
newtype Url = Url { fromUrl :: Text } deriving (Eq, Ord, Hashable)
newtype Category = Category { fromCategory :: Text } deriving (Eq, Ord)
data RelativeRank = RelativeRank { getIndex :: !Int, getTotal :: !Int }
instance Show Url where
show = unpack . fromUrl
instance Show Category where
show = unpack . fromCategory
instance Show RelativeRank where
show (RelativeRank index total) = "(" ++ show index ++ "/" ++ show total ++ ")"
-- a category is a `/` delimited path on a line alone
category :: Stream s m Char => ParsecT s u m Category
category = do
let section = (:) <$> char '/' <*> many1 (noneOf "/\n")
cat <- many1 section
char '\n'
return . Category . pack . concat $ cat
-- the urls for each category are given as "RANK. URL\n"
rankedUrl :: Stream s m Char => ParsecT s u m (Int -> (Url, RelativeRank))
rankedUrl = do
index <- many1 digit
string ". "
url <- many1 $ noneOf "\n"
char '\n'
return $ (Url $ pack url,) . RelativeRank (read index)
-- a full listing including the category and its ranked list of urls
categoryListing :: Stream s m Char => ParsecT s u m (Category, [(Url, RelativeRank)])
categoryListing = do
cat <- category
pairs <- many rankedUrl
let tot = length pairs
return $ (cat, map ($ tot) pairs)
transposeCategoryListing :: Stream s m Char => ParsecT s (Map Url [(Category,RelativeRank)]) m ()
transposeCategoryListing = do
(cat, pairs) <- categoryListing
let include !m (url,rank) = let pair = (cat,rank) in insertWith (const (pair:)) url [pair] m
modifyState $ foldl' include `flip` pairs
printProgress :: Stream s IO Char => Int -> ParsecT s (Map a b) IO ()
printProgress numLines = do
lineNo <- sourceLine <$> getPosition
let perc = (100 * lineNo) `div` numLines
let spac = 100 - perc
let bar = "[" ++ replicate perc '*' ++ replicate (100-perc) ' ' ++ "]"
count <- size <$> getState
let stat = show lineNo ++ "/" ++ show numLines ++ " lines, " ++ show count ++ " unique urls"
lift . hPutStr stderr $ "\r" ++ bar ++ " " ++ stat
loadTranspose :: (Stream s IO Char) => Int -> ParsecT s (Map Url [(Category,RelativeRank)]) IO [(Url,[(Category,RelativeRank)])]
loadTranspose numLines = do
printProgress numLines
many1 $ transposeCategoryListing >> printProgress numLines
eof
assocs <$> getState
main :: IO ()
main = do
[inputFile] <- getArgs
contents <- readFile inputFile
let numLines = fromIntegral $ count '\n' contents
result <- runParserT (loadTranspose numLines) empty inputFile contents
hPutStr stderr "\n"
case result of
Left err -> do
hPutStrLn stderr $ "Error: " ++ show err
exitFailure
Right xs -> do
forM_ xs $ \(url, ys) -> do
putStrLn $ show url
forM_ ys $ \(cat, rank) ->
putStrLn $ show rank ++ " " ++ show cat
Fri Dec 7 16:32 2012 Time and Allocation Profiling Report (Final)
AlexaTranspose +RTS -p -sstderr -RTS alexa_scrape.txt
total time = 8591.53 secs (8591534 ticks @ 1000 us, 1 processor)
total alloc = 41,104,762,684 bytes (excludes profiling overheads)
COST CENTRE MODULE %time %alloc
printProgress Main 97.0 16.9
transposeCategoryListing.include Main 2.1 3.1
rankedUrl Main 0.7 64.4
category.section Main 0.2 11.5
unstream/resize Data.Text.Fusion 0.0 2.1
individual inherited
COST CENTRE MODULE no. entries %time %alloc %time %alloc
MAIN MAIN 121 0 0.0 0.0 100.0 100.0
main Main 243 0 0.0 0.7 100.0 100.0
main.numLines Main 248 1 0.0 0.0 0.0 0.0
loadTranspose Main 244 1 0.0 0.0 100.0 99.3
printProgress Main 245 1 97.0 16.9 100.0 99.3
transposeCategoryListing Main 253 0 0.0 0.0 3.0 81.3
categoryListing Main 255 0 0.0 0.0 3.0 81.3
category Main 257 0 0.0 0.1 3.0 81.2
category.section Main 259 0 0.2 11.5 3.0 81.1
rankedUrl Main 266 0 0.7 64.4 2.8 69.7
unstream/resize Data.Text.Fusion 270 5474029 0.0 2.1 0.0 2.1
transposeCategoryListing.include Main 269 2421858 2.1 3.1 2.1 3.1
printProgress.stat Main 249 132479 0.0 0.4 0.0 0.4
printProgress.perc Main 247 132479 0.0 0.0 0.0 0.0
printProgress.bar Main 246 132479 0.0 0.8 0.0 0.8
CAF:main1 Main 240 0 0.0 0.0 0.0 0.0
main Main 242 1 0.0 0.0 0.0 0.0
CAF:lvl24_r5Ls Main 236 0 0.0 0.0 0.0 0.0
printProgress Main 250 0 0.0 0.0 0.0 0.0
printProgress.stat Main 251 0 0.0 0.0 0.0 0.0
CAF:transposeCategoryListing1 Main 234 0 0.0 0.0 0.0 0.0
transposeCategoryListing Main 252 1 0.0 0.0 0.0 0.0
CAF:categoryListing1 Main 228 0 0.0 0.0 0.0 0.0
categoryListing Main 254 1 0.0 0.0 0.0 0.0
CAF:rankedUrl1 Main 224 0 0.0 0.0 0.0 0.0
rankedUrl Main 265 1 0.0 0.0 0.0 0.0
CAF:lvl14_r5Lb Main 222 0 0.0 0.0 0.0 0.0
rankedUrl Main 268 0 0.0 0.0 0.0 0.0
CAF:lvl11_r5L7 Main 221 0 0.0 0.0 0.0 0.0
rankedUrl Main 267 0 0.0 0.0 0.0 0.0
CAF:category1 Main 217 0 0.0 0.0 0.0 0.0
category Main 256 1 0.0 0.0 0.0 0.0
CAF:lvl5_r5KW Main 214 0 0.0 0.0 0.0 0.0
category Main 264 0 0.0 0.0 0.0 0.0
CAF:a1_r5KU Main 213 0 0.0 0.0 0.0 0.0
category.section Main 258 1 0.0 0.0 0.0 0.0
CAF:w_r5KP Main 212 0 0.0 0.0 0.0 0.0
category Main 262 0 0.0 0.0 0.0 0.0
category.section Main 263 0 0.0 0.0 0.0 0.0
CAF:lvl_r5KN Main 211 0 0.0 0.0 0.0 0.0
category Main 260 0 0.0 0.0 0.0 0.0
category.section Main 261 0 0.0 0.0 0.0 0.0
CAF:$dStream_r5KM Main 210 0 0.0 0.0 0.0 0.0
CAF GHC.Conc.Signal 170 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Encoding 163 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Encoding.Iconv 161 0 0.0 0.0 0.0 0.0
CAF GHC.IO.FD 156 0 0.0 0.0 0.0 0.0
CAF GHC.IO.Handle.FD 154 0 0.0 0.0 0.0 0.0
CAF GHC.TopHandler 147 0 0.0 0.0 0.0 0.0
% ghc --make AlexaTranspose.hs -O2 -rtsopts -prof -auto-all -caf-all
[1 of 1] Compiling Main ( AlexaTranspose.hs, AlexaTranspose.o )
Linking AlexaTranspose ...
% time ./AlexaTranspose alexa_scrape.txt +RTS -p -sstderr -RTS >! alexa_invert.txt
[************************* ] 2554337/9871659 lines, 878489 unique urls
^C
66,271,808,844 bytes allocated in the heap
8,679,553,312 bytes copied during GC
1,358,659,700 bytes maximum residency (14 sample(s))
45,883,200 bytes maximum slop
3045 MB total memory in use (0 MB lost due to fragmentation)
Tot time (elapsed) Avg pause Max pause
Gen 0 127916 colls, 0 par 75.42s 191.38s 0.0015s 0.8661s
Gen 1 14 colls, 0 par 52.06s 6119.69s 437.1206s 4759.2361s
INIT time 0.00s ( 0.00s elapsed)
MUT time 8145.15s (8604.28s elapsed)
GC time 127.48s (6311.06s elapsed)
RP time 0.00s ( 0.00s elapsed)
PROF time 0.00s ( 0.00s elapsed)
EXIT time 0.00s ( 0.01s elapsed)
Total time 8272.63s (14915.36s elapsed)
%GC time 1.5% (42.3% elapsed)
Alloc rate 8,136,354 bytes per MUT second
Productivity 98.5% of total user, 54.6% of total elapsed
./AlexaTranspose alexa_scrape.txt +RTS -p -sstderr -RTS >| alexa_invert.txt 8272.63s user 406.75s system 58% cpu 4:08:36.02 total
%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment