Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Haskell Web Spider example
import Control.Exception
import Control.Monad
import Control.Monad.IO.Class
import Data.ByteString.Lazy (ByteString)
import Data.ByteString.Lazy.UTF8 (toString)
import Data.Function
import Data.Enumerator
import Data.List
import Data.Maybe
import Data.Map (Map)
import Data.Set (Set)
import Network.HTTP.Enumerator
import Network.URI
import Text.HTML.TagSoup
import qualified Data.ByteString.Lazy as B
import qualified Data.Enumerator.List as EL
import qualified Data.Map as Map
import qualified Data.Set as Set
type Resource = (URL, Content)
type URL = String
type Content = ByteString
spider :: Int -> URL -> IO (Map URL [URL])
spider count url = go count [url] Map.empty where
go 0 _ m = return m
go _ [] m = return m
go c (url:urls) history
| url `Map.member` history = go c urls history
| otherwise = do
res <- fetchResource url
let links = findLinks res
queue = urls ++ links
go (c-1) queue (Map.insert url links history)
fetchResource :: URL -> IO Resource
fetchResource url = fmap ((,) url) $ simpleHttp url
findLinks :: Resource -> [URL]
findLinks (baseurl, c) = normalize . findHrefs $ tags where
tags = canonicalizeTags . parseTags . toString $ c
findHrefs = mapMaybe href
normalize = mapMaybe $ normalizeLink baseurl
normalizeLink :: URL -> URL -> Maybe URL
normalizeLink base url = toString $ join $ url `relTo` base where
relTo a b = liftM2 nonStrictRelativeTo (parseURI a) (parseURIReference b)
toString = fmap $ ($"") . uriToString id
href :: Tag String -> Maybe String
href (TagOpen "a" attrs) = case lookup "rel" attrs of
Just "nofollow" -> Nothing
_ -> lookup "href" attrs
href _ = Nothing
---- Improved version using enumerator and iteratee
spiderEnum :: MonadIO m => URL -> Enumerator Resource m b
spiderEnum url = go [url] Set.empty where
go [] _ step = returnI step
go (url:urls) visited step
| url `Set.member` visited = go urls visited step
| otherwise = case step of
Continue k -> do
res <- liftIO $ fetchResource url
let links = findLinks res
cont = go (urls ++ links) (Set.insert url visited)
k (Chunks [res]) >>== cont
_ -> returnI step
printURLs :: Iteratee Resource IO ()
printURLs = EL.mapM_ $ putStrLn . fst
main = run_ task where
task = spiderEnum "http://leonidasoy.fi" $$ EL.isolate 10 $$ printURLs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.