@rodrigoalviani
Forked from apeyroux/crawler haskell
Created January 5, 2016 10:04
-- | URL docs: http://hackage.haskell.org/package/url-2.1.3/docs/Network-URL.html
module Page where

import Network.URL
import Network.Curl
import Text.XML.HXT.Core
import Text.HandsomeSoup
-- | A crawled page: its title, raw HTML body, parsed URL and crawl statistics.
data Page = Page
  { title   :: String
  , content :: String
  , url     :: Maybe URL
  , stat    :: Stat
  } deriving Show

-- | Crawl statistics; dwltime holds the download time.
data Stat = Stat
  { dwltime :: String
  } deriving Show
-- | Things that can be ranked in an index.
class Indexable i where
  rank      :: i -> Integer
  backlinks :: [i] -> Integer
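
-- A hypothetical instance, not part of the original gist: one way Page could
-- satisfy Indexable. The constant rank and the length-based backlink count
-- are placeholder assumptions for illustration only.
instance Indexable Page where
  rank _    = 0
  backlinks = fromIntegral . length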
-- | Extract the text of the first <title> element from an HTML document,
-- or the empty string when no title is found.
html2title :: String -> IO String
html2title h = (runX $ doc >>> css "title" /> getText) >>= return . getTitle
  where
    doc        = readString [withParseHTML yes, withWarnings no] h
    getTitle t = if null t then "" else head t
-- | Download a URL (following redirects) and build a Page from the response.
-- The download time is not measured yet, hence the placeholder Stat "0".
crawlurl :: String -> IO Page
crawlurl u = do
  r <- (curlGetResponse_ u [CurlFollowLocation True] :: IO CurlResponse)
  t <- html2title (respBody r)
  return $ Page t (respBody r) (importURL u) (Stat "0")