@szastupov
Created February 7, 2010 01:35
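
-- A depth-limited web crawler: fetches pages with curl, extracts the
-- links with tagsoup, records the link structure as an fgl graph, and
-- prints it in Graphviz format.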
{-# LANGUAGE OverloadedStrings #-}
import System.IO
import Network.Curl
import Network.URI
import Text.HTML.TagSoup
import Text.Printf
import Data.Graph.Inductive
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.ByteString.Lazy as BS
import Data.ByteString.Lazy (ByteString)
import Data.String
import Data.Maybe
import Data.Word
import Data.Char
import Control.Monad.State
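
-- Nodes are labelled with URLs; UrlMap caches the URL -> node index
-- mapping so each URL gets exactly one node in the graph.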
type WebGraph = Gr String ()
type UrlMap = M.Map String Int
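
-- Crawler state, threaded through the State monad: the link graph so
-- far, the URL -> index cache, the set of visited URLs, and the next
-- free node index.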
data Web = Web { graph   :: WebGraph
               , umap    :: UrlMap
               , visited :: S.Set String
               , count   :: Int
               } deriving Show
type WebState a = State Web a
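
-- The initial, empty crawler state.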
emptyWeb = Web { graph   = empty
               , umap    = M.empty
               , visited = S.empty
               , count   = 0
               }
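
-- Register url under node index idx and bump the counter.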
newUrl url idx st =
    st { graph = insNode (idx, url) (graph st)
       , umap  = M.insert url idx (umap st)
       , count = idx + 1
       }
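
-- Look up the node index for a URL, allocating a fresh node if the
-- URL has not been seen before.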
mapUrl :: String -> WebState Node
mapUrl url = do
    st <- get
    case M.lookup url (umap st) of
      Just i  -> return i
      Nothing -> do
        let idx = count st
        modify (newUrl url idx)
        return idx
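
-- Membership test and insertion for the visited set.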
isVisited url web = url `S.member` (visited web)
markVisited url web = web { visited = S.insert url (visited web) }
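
-- Record that url links to each of links: mark url visited, make sure
-- every URL has a node, and add an edge from url to each target.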
linksFrom url links = do
    modify (markVisited url)
    self    <- mapUrl url
    indices <- mapM mapUrl links
    st <- get
    let link i = insEdge (self, i, ())
    put $ st { graph = foldr link (graph st) indices }
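
-- Extract the href attribute of every <a> tag, decoding each lazy
-- ByteString to a String byte by byte.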
getLinks :: ByteString -> [String]
getLinks txt =
    map (map (chr . fromIntegral) . BS.unpack) $
        [link | TagOpen "a" atts <- parseTags txt
              , ("href", link) <- atts]
-- Ugly!
urljoin parent url =
    if isRelativeReference url
    then fromJust $ do
        p <- parseURIReference parent
        u <- parseURIReference url
        r <- relativeTo u p
        return $ uriToString id r ""
    else url
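
-- Crawl url to depth d, accumulating results into web; visited URLs
-- are skipped.  Note the hardcoded HTTP proxy at 127.0.0.1:8123
-- (e.g. a local polipo instance); drop the CurlProxy option to fetch
-- directly.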
crawl :: Int -> Web -> String -> IO Web
crawl 0 web _ = return web
crawl d web url
    | isVisited url web = return web
    | otherwise = do
        hPrintf stderr "depth %d, url %s\n" d url
        (ret, txt) <- curlGetString_ url [CurlProxy "127.0.0.1:8123"]
        case ret of
          CurlOK -> let links = map (urljoin url) $ getLinks txt
                        web'  = execState (linksFrom url links) web
                    in foldM (crawl (d-1)) web' links
          _ -> return web
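
-- Print the accumulated graph in Graphviz (dot) format on stdout.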
gv web = putStrLn $ graphviz (graph web) "Web" (8.5, 11) (1, 1) Landscape
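
-- Crawl a small test wiki to depth 2; withCurlDo initialises libcurl.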
test = withCurlDo $
    crawl 2 emptyWeb "http://kiwitobes.com/wiki/0.html"
main = test >>= gv
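
-- Build sketch (assuming this file is saved as crawler.hs and the
-- curl, tagsoup, and fgl packages from Hackage are installed):
--   ghc --make crawler.hs -o crawler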