Skip to content

Instantly share code, notes, and snippets.

@astanin
Created December 15, 2011 17:02
Show Gist options
  • Save astanin/1481868 to your computer and use it in GitHub Desktop.
Save astanin/1481868 to your computer and use it in GitHub Desktop.
GTT | Avvisi. Atom feed generator
-- A script to generate Atom feed from a feed-less webpage
import Network.HTTP
import Text.HTML.TagSoup
import Text.Atom.Feed
import Text.Atom.Feed.Export
import Text.XML.Light
import Data.List (isPrefixOf)
import Data.Time (getCurrentTime)
import Control.Monad (liftM)
import qualified Text.Html as H
gttURL = "http://www.comune.torino.it/gtt/avvisi/index.shtml"
gttBaseURL = "http://www.comune.torino.it"
openURL x = getResponseBody =<< simpleHTTP (getRequest x)
avvisoStart = [TagOpen "div" [("class","avviso")]]
avvisoEnd = [TagClose "div"]
-- | Extract a section from the list. Use @before@ and @after@ markers
-- to find the beginning and the end of the section. Return section and
-- the unprocessed part of the list.
getSection :: (Eq a) => [a] -> [a] -> [a] -> ([a], [a])
getSection _ _ [] = ([], [])
getSection before after xs
| before `isPrefixOf` xs =
let xs' = drop (length before) xs
in splitOn after [] xs'
| otherwise = getSection before after (drop 1 xs)
where
splitOn mrk acc rest@(x:xs)
| mrk `isPrefixOf` rest = (reverse acc, drop (length mrk) rest)
| otherwise = splitOn mrk (x:acc) xs
splitOn _ acc [] = (reverse acc, [])
getAllSections tags = getAll [] tags
where
getAll found [] = found
getAll found tags =
let (sec,rest) = getSection avvisoStart avvisoEnd tags
in getAll (found ++ sec) rest
extractLinks tags = extract [] tags
where
extract links ((TagOpen "a" attrs):(TagText txt):TagClose "a":rest) =
let href = maybe "" (gttBaseURL ++) $ lookup "href" attrs
in extract ((href, txt):links) rest
extract links [] = reverse links
extract links rest = extract links (drop 1 rest)
toItem date (href,txt) =
let e0 = nullEntry href (TextString txt) date
in e0 { entryLinks = links, entrySummary = Just (HTMLString html) }
where
html = H.prettyHtml $ H.anchor (H.stringToHtml txt) H.! [H.href href]
links = [ (nullLink href) { linkTitle = Just txt } ]
main = do
page <- openURL gttURL
now <- show `liftM` getCurrentTime
let tags = parseTags page
let tags' = getAllSections tags
let links = extractLinks tags'
let emptyFeed = nullFeed gttURL (TextString "GTT | Avvisi") now
let items = map (toItem now) links
let feed = emptyFeed { feedEntries = items }
let ppxml = ppTopElement (xmlFeed feed)
putStr ppxml
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment