Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
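-- | Page model and helpers for fetching a web page.
--
-- Reference documentation for the @url@ package ("Network.URL"):
-- <http://hackage.haskell.org/package/url-2.1.3/docs/Network-URL.html>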
module Page where
import Network.URL
import Network.Curl
import Text.XML.HXT.Core
import Text.HandsomeSoup
-- | A web page together with the metadata recorded for it.
data Page = Page
  { title   :: String     -- ^ Title text of the page.
  , content :: String     -- ^ Body of the page as a string.
  , url     :: Maybe URL  -- ^ Parsed address of the page; 'Nothing' when absent.
  , stat    :: Stat       -- ^ Statistics attached to the page.
  } deriving Show
-- | Statistics attached to a 'Page'.
data Stat = Stat
  { dwltime :: String
    -- ^ Presumably the download time, stored as text — TODO confirm meaning
    -- and units against the code that fills it in.
  } deriving Show
-- | Types whose values can be given integer scores for indexing.
class Indexable i where
  -- | Integer score of a single item.
  rank :: i -> Integer

  -- | Integer computed from a list of items.
  -- NOTE(review): presumably a backlink count — confirm against instances.
  backlinks :: [i] -> Integer
-- | Extract the title text from an HTML document.
--
-- The markup in @h@ is read with HXT's HTML parser (@withParseHTML yes@)
-- with warnings disabled (@withWarnings no@). The text children of every
-- element matched by the CSS selector @title@ are collected, and the first
-- one is returned. When nothing matches, the result is the empty string.
html2title :: String -> IO String
html2title h = firstOrEmpty <$> runX (doc >>> css "title" /> getText)
  where
    doc = readString [withParseHTML yes, withWarnings no] h

    -- Total replacement for a guarded 'head': first match, or "" if none.
    firstOrEmpty (t : _) = t
    firstOrEmpty []      = ""
-- | Fetch the document at address @u@ and wrap it in a 'Page'.
--
-- The request is made with @CurlFollowLocation True@. The resulting 'Page'
-- holds the title obtained by running 'html2title' on the response body,
-- the response body itself, the result of 'importURL' applied to @u@, and
-- a 'Stat' whose 'dwltime' is the literal @"0"@.
crawlurl :: String -> IO Page
crawlurl u = do
  response <- curlGetResponse_ u [CurlFollowLocation True] :: IO CurlResponse
  -- NOTE(review): the response's curl result code is not inspected here, so
  -- a failed transfer still yields a Page — confirm this is intended.
  let body = respBody response
  pageTitle <- html2title body
  return (Page pageTitle body (importURL u) (Stat "0"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.