Skip to content

Instantly share code, notes, and snippets.

@matthewSorensen
Created January 23, 2012 06:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save matthewSorensen/1661295 to your computer and use it in GitHub Desktop.
Save matthewSorensen/1661295 to your computer and use it in GitHub Desktop.
Fetches + deconstructs RSS/Atom feeds.
{-# LANGUAGE OverloadedStrings #-}
module Quasivore.RSS (fetchRSS, RSS (..))
where
import Quasivore.Post (Post,post)
import Data.Conduit
import Network.HTTP.Conduit hiding (proxy)
import Text.XML
import Text.XML.Cursor
import Data.Maybe
import Data.ByteString.Char8 (pack)
import Data.Text.Lazy (fromStrict)
import Control.Applicative ((<$>),(<*>))
fetchRSS::RSS->IO [Post]
fetchRSS = fmap parseRSS . fetchXML
data RSS = Feed {
uri::String,
auth::Maybe (String,String),
proxy::Maybe (String,Int)
} deriving (Show,Eq)
-- Currently using withManager for simplicity. In a multi-threaded bit, we can
-- figure out how to just use a single one.
fetchXML::RSS->IO Document
fetchXML r = withManager $ \man -> do
Response _ _ bsrc <- http (rssToRequest r) man
bsrc $$ sinkDoc def
-- I would like this to support https, but can't seem to get that working with
-- http-conduit.
rssToRequest::RSS->Request m
rssToRequest r = aut r $ prox r $ url $ uri r
where url = fromJust . parseUrl . ("http://"++)
prox = maybe id (\(u,p)-> addProxy (pack u) p) . proxy
aut = maybe id (\(u,p)-> applyBasicAuth (pack u) (pack p)) . auth
-- We'll do all of our parsing using the list monad as our failure monad.
(<|>) a b cursor = case a cursor of
[] -> b cursor
x -> x
-- Denotes a value that can default to ""
optional [] = [""]
optional (x:_) = [fromStrict x]
-- A value whose absence will cause a parse failure.
required [] = []
required (x:_) = [fromStrict x]
parseRSS = (>>= assemblePosts . child) . trimDoc . fromDocument
where assemblePosts doc = do
-- Get the source and list of feed-items from the document:
src <- source doc
items <- rawPosts doc
-- Now build a "single" post
item <- child items
pUrl <- required $ url item
pTitle <- optional $ title item
return $ post src pUrl pTitle
url = guid <|> link <|> source
where guid = laxElement "guid" >=> content
link = laxElement "link" >=> (attribute "href" <|> (child >=> content))
source = laxElement "source" >=> attribute "url"
title = laxElement "title" >=> child >=> content
trimDoc = (laxElement "rss" >=> child >=> laxElement "channel") <|> laxElement "feed"
rawPosts = (>>= laxElement "item" <|> laxElement "entry")
source = optional . (>>= laxElement "link" >=> (attribute "href" <|> (child >=> content)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment