Created
January 23, 2012 06:52
-
-
Save matthewSorensen/1661295 to your computer and use it in GitHub Desktop.
Fetches + deconstructs RSS/Atom feeds.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
module Quasivore.RSS (fetchRSS, RSS (..)) | |
where | |
import Quasivore.Post (Post,post) | |
import Data.Conduit | |
import Network.HTTP.Conduit hiding (proxy) | |
import Text.XML | |
import Text.XML.Cursor | |
import Data.Maybe | |
import Data.ByteString.Char8 (pack) | |
import Data.Text.Lazy (fromStrict) | |
import Control.Applicative ((<$>),(<*>)) | |
fetchRSS::RSS->IO [Post] | |
fetchRSS = fmap parseRSS . fetchXML | |
data RSS = Feed { | |
uri::String, | |
auth::Maybe (String,String), | |
proxy::Maybe (String,Int) | |
} deriving (Show,Eq) | |
-- Currently using withManager for simplicity. In a multi-threaded bit, we can | |
-- figure out how to just use a single one. | |
fetchXML::RSS->IO Document | |
fetchXML r = withManager $ \man -> do | |
Response _ _ bsrc <- http (rssToRequest r) man | |
bsrc $$ sinkDoc def | |
-- I would like this to support https, but can't seem to get that working with | |
-- http-conduit. | |
rssToRequest::RSS->Request m | |
rssToRequest r = aut r $ prox r $ url $ uri r | |
where url = fromJust . parseUrl . ("http://"++) | |
prox = maybe id (\(u,p)-> addProxy (pack u) p) . proxy | |
aut = maybe id (\(u,p)-> applyBasicAuth (pack u) (pack p)) . auth | |
-- We'll do all of our parsing using the list monad as our failure monad. | |
(<|>) a b cursor = case a cursor of | |
[] -> b cursor | |
x -> x | |
-- Denotes a value that can default to "" | |
optional [] = [""] | |
optional (x:_) = [fromStrict x] | |
-- A value whose absence will cause a parse failure. | |
required [] = [] | |
required (x:_) = [fromStrict x] | |
parseRSS = (>>= assemblePosts . child) . trimDoc . fromDocument | |
where assemblePosts doc = do | |
-- Get the source and list of feed-items from the document: | |
src <- source doc | |
items <- rawPosts doc | |
-- Now build a "single" post | |
item <- child items | |
pUrl <- required $ url item | |
pTitle <- optional $ title item | |
return $ post src pUrl pTitle | |
url = guid <|> link <|> source | |
where guid = laxElement "guid" >=> content | |
link = laxElement "link" >=> (attribute "href" <|> (child >=> content)) | |
source = laxElement "source" >=> attribute "url" | |
title = laxElement "title" >=> child >=> content | |
trimDoc = (laxElement "rss" >=> child >=> laxElement "channel") <|> laxElement "feed" | |
rawPosts = (>>= laxElement "item" <|> laxElement "entry") | |
source = optional . (>>= laxElement "link" >=> (attribute "href" <|> (child >=> content))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment