Skip to content

Instantly share code, notes, and snippets.

@reneklacan
Last active August 29, 2015 14:21
Show Gist options
  • Save reneklacan/262eef04a3bc67607dd8 to your computer and use it in GitHub Desktop.
Save reneklacan/262eef04a3bc67607dd8 to your computer and use it in GitHub Desktop.
Crawl a page and print a JSON with site info
{-# LANGUAGE OverloadedStrings #-}
import Data.Tree.NTree.TypeDefs
import Text.XML.HXT.Core
import Data.Text hiding (length,foldl)
import qualified Data.HashMap.Strict as M
import Data.Aeson
import Text.HandsomeSoup
data SiteInfo = SiteInfo
{ title :: Text
, githubUrl :: Text
, articleCount :: Int
, articleNames :: [Text]
} deriving Show
instance ToJSON SiteInfo where
toJSON (SiteInfo title githubUrl articleCount articleNames) =
object
[ "title" .= title
, "github_url" .= githubUrl
, "article_count" .= articleCount
, "article_names" .= articleNames
]
siteTitle :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree String
siteTitle tree = tree >>> css "a.site-title" //> getText
siteGithubUrl :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree String
siteGithubUrl tree = tree >>> css "a.fa-github" ! "href"
siteArticleNames :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree String
siteArticleNames tree = tree >>> css ".post-title" //> getText
getList
:: IOSArrow XmlTree (NTree XNode)
-> (IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree String)
-> IO [String]
getList doc selector = runX $ selector doc
getOne
:: IOSArrow XmlTree (NTree XNode)
-> (IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree String)
-> IO String
getOne doc selector = fmap (foldl (++) "") $ getList doc selector
getInfo :: IOSArrow XmlTree (NTree XNode) -> IO SiteInfo
getInfo doc = do
sTitle <- getOne doc siteTitle
sGithubUrl <- getOne doc siteGithubUrl
sArticleNames <- getList doc siteArticleNames
return
SiteInfo
{ title = pack sTitle
, githubUrl = pack sGithubUrl
, articleCount = length sArticleNames
, articleNames = fmap pack sArticleNames
}
get :: String -> IO (IOSArrow XmlTree (NTree XNode))
get url = do
return $ fromUrl url
main :: IO ()
main = do
doc <- get "http://rene.klacan.sk"
hash <- fmap encode $ getInfo doc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment