Created
February 7, 2010 01:35
-
-
Save szastupov/297119 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
import System.IO | |
import Network.Curl | |
import Network.URI | |
import Text.HTML.TagSoup | |
import Text.Printf | |
import Data.Graph.Inductive | |
import qualified Data.Map as M | |
import qualified Data.Set as S | |
import qualified Data.ByteString.Lazy as BS | |
import Data.ByteString.Lazy (ByteString) | |
import Data.String | |
import Data.Maybe | |
import Data.Word | |
import Data.Char | |
import Control.Monad.State | |
type WebGraph = Gr String () | |
type UrlMap = M.Map String Int | |
data Web = Web { graph :: WebGraph | |
, umap :: UrlMap | |
, visited :: S.Set String | |
, count :: Int | |
} | |
deriving Show | |
type WebState a = State Web a | |
emptyWeb = Web { graph = empty | |
, umap = M.empty | |
, visited = S.empty | |
, count = 0 | |
} | |
newUrl url idx st = | |
st { graph = insNode (idx, url) (graph st) | |
, umap = M.insert url idx (umap st) | |
, count = idx+1 | |
} | |
mapUrl :: String -> WebState Node | |
mapUrl url = do | |
st <- get | |
case M.lookup url (umap st) of | |
Just i -> return i | |
Nothing -> do | |
let idx = (count st) | |
modify (newUrl url idx) | |
return idx | |
isVisited url web = url `S.member` (visited web) | |
markVisited url web = web { visited = S.insert url (visited web) } | |
linksFrom url links = do | |
modify (markVisited url) | |
self <- mapUrl url | |
indeces <- mapM mapUrl links | |
st <- get | |
let link i = insEdge (self, i, ()) | |
put $ st { graph = foldr link (graph st) indeces } | |
getLinks :: ByteString -> [String] | |
getLinks txt = | |
map (map (chr . fromIntegral) . BS.unpack) $ | |
[link | TagOpen "a" atts <- parseTags txt | |
, ("href", link) <- atts] | |
-- Ugly! | |
urljoin parent url = | |
if isRelativeReference url | |
then fromJust $ do | |
p <- parseURIReference parent | |
u <- parseURIReference url | |
r <- relativeTo u p | |
return $ uriToString id r $ "" | |
else url | |
crawl :: Int -> Web -> String -> IO Web | |
crawl 0 web _ = return web | |
crawl d web url | |
| isVisited url web = return web | |
| otherwise = do | |
hPrintf stderr "depth %d, url %s\n" d url | |
(ret, txt) <- curlGetString_ url [CurlProxy "127.0.0.1:8123"] | |
case ret of | |
CurlOK -> let links = map (urljoin url) $ getLinks txt | |
web' = execState (linksFrom url links) web | |
in foldM (crawl (d-1)) web' links | |
_ -> return web | |
gv web = putStrLn $ graphviz (graph web) "Web" (8.5, 11) (1, 1) Landscape | |
test = withCurlDo $ do | |
web <- crawl 2 emptyWeb "http://kiwitobes.com/wiki/0.html" | |
return web | |
main = test >>= gv |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment