Created
December 15, 2011 17:02
-
-
Save astanin/1481868 to your computer and use it in GitHub Desktop.
GTT | Avvisi. Atom feed generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- A script to generate Atom feed from a feed-less webpage | |
import Network.HTTP | |
import Text.HTML.TagSoup | |
import Text.Atom.Feed | |
import Text.Atom.Feed.Export | |
import Text.XML.Light | |
import Data.List (isPrefixOf) | |
import Data.Time (getCurrentTime) | |
import Control.Monad (liftM) | |
import qualified Text.Html as H | |
gttURL = "http://www.comune.torino.it/gtt/avvisi/index.shtml" | |
gttBaseURL = "http://www.comune.torino.it" | |
openURL x = getResponseBody =<< simpleHTTP (getRequest x) | |
avvisoStart = [TagOpen "div" [("class","avviso")]] | |
avvisoEnd = [TagClose "div"] | |
-- | Extract a section from the list. Use @before@ and @after@ markers | |
-- to find the beginning and the end of the section. Return section and | |
-- the unprocessed part of the list. | |
getSection :: (Eq a) => [a] -> [a] -> [a] -> ([a], [a]) | |
getSection _ _ [] = ([], []) | |
getSection before after xs | |
| before `isPrefixOf` xs = | |
let xs' = drop (length before) xs | |
in splitOn after [] xs' | |
| otherwise = getSection before after (drop 1 xs) | |
where | |
splitOn mrk acc rest@(x:xs) | |
| mrk `isPrefixOf` rest = (reverse acc, drop (length mrk) rest) | |
| otherwise = splitOn mrk (x:acc) xs | |
splitOn _ acc [] = (reverse acc, []) | |
getAllSections tags = getAll [] tags | |
where | |
getAll found [] = found | |
getAll found tags = | |
let (sec,rest) = getSection avvisoStart avvisoEnd tags | |
in getAll (found ++ sec) rest | |
extractLinks tags = extract [] tags | |
where | |
extract links ((TagOpen "a" attrs):(TagText txt):TagClose "a":rest) = | |
let href = maybe "" (gttBaseURL ++) $ lookup "href" attrs | |
in extract ((href, txt):links) rest | |
extract links [] = reverse links | |
extract links rest = extract links (drop 1 rest) | |
toItem date (href,txt) = | |
let e0 = nullEntry href (TextString txt) date | |
in e0 { entryLinks = links, entrySummary = Just (HTMLString html) } | |
where | |
html = H.prettyHtml $ H.anchor (H.stringToHtml txt) H.! [H.href href] | |
links = [ (nullLink href) { linkTitle = Just txt } ] | |
main = do | |
page <- openURL gttURL | |
now <- show `liftM` getCurrentTime | |
let tags = parseTags page | |
let tags' = getAllSections tags | |
let links = extractLinks tags' | |
let emptyFeed = nullFeed gttURL (TextString "GTT | Avvisi") now | |
let items = map (toItem now) links | |
let feed = emptyFeed { feedEntries = items } | |
let ppxml = ppTopElement (xmlFeed feed) | |
putStr ppxml | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment