public
Last active

Alexa scraper example from /r/haskell on reddit

  • Download Gist
Main.hs
Haskell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}
 
import Blaze.ByteString.Builder (Builder,
fromByteString)
import Blaze.ByteString.Builder.Char8 (fromChar, fromShow)
import Blaze.ByteString.Builder.Internal.Buffer (allocBuffer)
import Control.Applicative (pure, (<$>), (<*),
(<|>))
import Control.Monad (void)
import Data.Attoparsec.ByteString.Char8 (char, decimal,
endOfInput,
endOfLine, takeTill,
takeWhile)
import Data.ByteString.Char8 (ByteString)
import qualified Data.HashTable.IO as H
import Data.Maybe (fromMaybe)
import Data.Monoid (mconcat)
import Data.Vector (Vector)
import qualified Data.Vector as V
import Prelude hiding (takeWhile)
import System.IO.Streams (InputStream,
OutputStream)
import qualified System.IO.Streams as Streams
import qualified System.IO.Streams.Attoparsec as Streams
 
 
data OutputRecord = OutputRecord {
_category :: !ByteString
, _index :: {-# UNPACK #-} !Int
, _max :: {-# UNPACK #-} !Int
}
 
data HostRecord = HostRecord {
_host :: !ByteString
, _hostindex :: {-# UNPACK #-} !Int
}
 
type Url = ByteString
 
type HT = H.CuckooHashTable Url ([OutputRecord] -> [OutputRecord])
 
 
------------------------------------------------------------------------------
-- Input has format:
-- /category/foo
-- 0. host
-- 1. host
-- /category/next
-- ...
--
-- Given a stream, produce an InputStream that yields hosts from one of these
-- blocks until the next category is reached.
hostsStream :: InputStream ByteString -> IO (InputStream HostRecord)
hostsStream = Streams.parserToInputStream ((Just <$> parser) <|> pure Nothing)
where
parser = do
idx <- decimal
void $ char '.'
void $ takeWhile (== ' ')
host <- takeTill (== '\n')
void $ endOfLine
return $! HostRecord host idx
 
 
------------------------------------------------------------------------------
getCategory :: InputStream ByteString -> IO (Maybe ByteString)
getCategory = Streams.parseFromStream p
where
p = (endOfInput >> pure Nothing) <|>
(Just <$> takeWhile (/= '\n') <* endOfLine)
 
 
------------------------------------------------------------------------------
eatCategory :: InputStream ByteString
-> IO (Maybe (ByteString, Vector HostRecord))
eatCategory input = getCategory input >>=
maybe (return Nothing) ((Just <$>) . proc)
where
proc :: ByteString -> IO (ByteString, Vector HostRecord)
proc category = do
hosts <- hostsStream input >>= Streams.toVector
return (category, hosts)
 
 
------------------------------------------------------------------------------
procCategory :: InputStream ByteString
-> HT
-> IO Bool
procCategory input ht = eatCategory input >>= maybe (return False) go
where
go (category, records) = V.mapM_ add records >> return True
where
add (HostRecord host idx) = do
l <- fromMaybe id <$> H.lookup ht host
let !r = OutputRecord category idx len
H.insert ht host (l . (r :))
 
len = V.length records
 
 
------------------------------------------------------------------------------
readInput :: InputStream ByteString -> HT -> IO ()
readInput input ht = go
where
go = do
b <- procCategory input ht
if b then go else return ()
 
 
------------------------------------------------------------------------------
writeOutput :: OutputStream Builder -> HT -> IO ()
writeOutput output ht = H.mapM_ writeOne ht >> Streams.write Nothing output
where
writeOne (host, records) = flip Streams.write output $! Just $!
mconcat $ [ fromByteString host,
fromChar '\n'
] ++ (map record $ records [])
record (OutputRecord category idx mx) =
mconcat [ fromChar '('
, fromShow $! (idx + 1)
, fromChar '/'
, fromShow mx
, fromByteString ") "
, fromByteString category
, fromChar '\n'
]
 
 
------------------------------------------------------------------------------
main :: IO ()
main = do
ht <- H.newSized 13000000
out <- Streams.unsafeBuilderStream (allocBuffer 65535) Streams.stdout
readInput Streams.stdin ht
writeOutput out ht

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.