Skip to content

Instantly share code, notes, and snippets.

@cmoore
Created April 1, 2012 22:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save cmoore/2279125 to your computer and use it in GitHub Desktop.
Save cmoore/2279125 to your computer and use it in GitHub Desktop.
module Main where
import qualified Data.ByteString.Char8 as B
import Data.Tree.NTree.TypeDefs
import Data.Maybe
import Text.XML.HXT.XPath
import Text.XML.HXT.Core
import Control.Monad
import Control.Monad.Trans
import Control.Monad.Trans.Maybe
import Network.HTTP
import Network.URI
import System.Environment
import Control.Concurrent.ParallelIO
-- helper function for getting page content
openUrl :: String -> MaybeT IO String
openUrl url = case parseURI url of
Nothing -> fail ""
Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u))
cssToXPath :: String -> String
cssToXPath s = "//" ++ s
css :: ArrowXml a => String -> a XmlTree XmlTree
css = getXPathTreesInDoc . cssToXPath
get :: String -> IO (IOSArrow XmlTree (NTree XNode))
get url = do
contents <- runMaybeT $ openUrl url
return $ readString [withParseHTML yes, withWarnings no] (fromMaybe "" contents)
images :: ArrowXml cat => cat a XmlTree -> cat a String
images tree = tree >>> css "img" >>> getAttrValue "src"
parseArgs :: IO String
parseArgs = do
args <- getArgs
case args of
(url:[]) -> return url
_ -> error "usage: grabber [url]"
download :: String -> IO ()
download url = do
content <- runMaybeT $ openUrl url
case content of
Nothing -> putStrLn $ "bad url: " ++ url
Just _content -> do
let name = tail . uriPath . fromJust . parseURI $ url
B.writeFile name (B.pack _content)
main :: IO ()
main = do
url <- parseArgs
doc <- get url
imgs <- runX . images $ doc
parallel_ $ map download imgs
stopGlobalPool
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment