Skip to content

Instantly share code, notes, and snippets.

@merijn
Created Jul 9, 2015
Embed
What would you like to do?
Concurrent web scraper
{-# LANGUAGE OverloadedStrings #-}
import Control.Concurrent.Async
import Control.Concurrent.QSem
import Control.Monad
import Control.Monad.Catch
import Control.Monad.Trans
import Control.Monad.Reader
import Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy as LBS
import Data.Foldable
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Conc.Sync
import Network (withSocketsDo)
import Network.URI
import Network.HTTP.Conduit hiding (withManager)
import Text.XML.Cursor
import Text.HTML.DOM
import System.FilePath
-- | Lift an 'IO' action into 'Concurrently' by wrapping it in the
-- constructor.  NOTE(review): this is an orphan instance (neither
-- 'MonadIO' nor 'Concurrently' is defined here); tolerable in a
-- standalone script, but it would conflict if the @async@ package ever
-- gained its own instance.
instance MonadIO Concurrently where
    liftIO = Concurrently
-- | The scraping monad: a 'ReaderT' carrying the shared request-limiting
-- 'QSem' and the HTTP connection 'Manager', layered over 'Concurrently'
-- so independent scraper actions can run in parallel.
type Scraper = ReaderT (QSem, Manager) Concurrently
-- | Archive page URLs: the base URL with page numbers 1..51 appended.
-- Currently restricted to the first page only by 'take 1' — presumably a
-- debugging leftover; widen or drop the 'take' to scrape every page.
urls :: [Text]
urls = take 1 (map pageURL [1 .. 51 :: Int])
  where
    pageURL n = baseURL <> T.pack (show n)
    baseURL = T.pack "http://example.com/base/url"
-- | Download the body of @url@.  The shared 'QSem' caps how many HTTP
-- requests are in flight at once: the semaphore slot is acquired before
-- the request and released afterwards, and 'bracket_' guarantees the
-- release even if 'httpLbs' throws.  NOTE(review): 'parseUrl' throws in
-- 'IO' on a malformed URL, which aborts this scraper action — confirm
-- that is the intended failure mode.
fetch :: Text -> Scraper ByteString
fetch url = do
    (qsem, manager) <- ask
    -- Parse the URL into a Request first, outside the semaphore.
    src <- liftIO $ parseUrl (T.unpack url)
    -- Hold a semaphore slot only for the duration of the actual request.
    liftIO . bracket_ (waitQSem qsem) (signalQSem qsem) $
        responseBody <$> httpLbs src manager
-- | Run a 'Scraper' action, allowing at most @count@ simultaneous HTTP
-- requests.  A single connection 'Manager' is shared by all requests and
-- is reliably closed again afterwards via 'bracket'.
runScraper :: Int -> Scraper a -> IO a
runScraper count scraper = do
    sem <- newQSem count
    bracket (newManager conduitManagerSettings) closeManager $ \mgr ->
        runConcurrently (runReaderT scraper (sem, mgr))
-- | Download the comic image linked from the page at @url@ and write it
-- to the current directory, named after the last path component of the
-- image URI.  Pages without the expected @div#comic img@ structure, or
-- whose image @src@ is not an absolute URI, are skipped with a
-- diagnostic message instead of crashing.  (The original used the
-- partial patterns @let [comicSrc] = ...@ and @Just comicURI = ...@,
-- which throw an irrecoverable pattern-match failure on any page that
-- does not match exactly.)
grabComic :: Text -> Scraper ()
grabComic url = do
    cursor <- fromDocument . parseLBS <$> fetch url
    let imgSrcs = cursor
            $// element "div"
            >=> attributeIs "id" "comic"
            >=> descendant
            >=> element "img"
            >=> attribute "src"
    case imgSrcs of
        [] -> liftIO . putStrLn $
            "No comic image found at: " ++ T.unpack url
        (comicSrc:_) ->
            case parseURI (T.unpack comicSrc) of
                Nothing -> liftIO . putStrLn $
                    "Could not parse comic URI: " ++ T.unpack comicSrc
                Just comicURI -> do
                    let outputFile = takeFileName (uriPath comicURI)
                    fetch comicSrc >>= liftIO . LBS.writeFile outputFile
-- | Fetch the archive page at @url@, collect the target of every anchor
-- element, and print the collected list.  The call that would actually
-- download each comic is currently commented out — this looks like a
-- debugging state.  NOTE(review): the selectors use upper-case "A" and
-- "HREF"; html-conduit normally lower-cases tag/attribute names when
-- parsing HTML, so confirm these actually match anything on the target
-- page.
crawlArchive :: Text -> Scraper ()
crawlArchive url = do
    cursor <- fromDocument . parseLBS <$> fetch url
    let comicUrls = cursor
            $// element "A"
            >=> attribute "HREF"
    liftIO $ print comicUrls
    --traverse_ grabComic comicUrls
-- | Entry point: initialise networking (a no-op outside Windows), use
-- every available processor, and crawl the archive index with at most
-- 100 concurrent HTTP requests.  Fixes a truncated string literal in
-- the original (the closing quote of the archive URL was missing, so
-- the file did not parse).
main :: IO ()
main = withSocketsDo $ do
    getNumProcessors >>= setNumCapabilities
    runScraper 100 $ crawlArchive "http://example.com/"
@gregnwosu

This comment has been minimized.

Copy link

@gregnwosu gregnwosu commented Mar 24, 2016

lovely, and thanks greg` from #haskell

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment