Skip to content

Instantly share code, notes, and snippets.

@master-q
Created January 5, 2012 09:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save master-q/1564437 to your computer and use it in GitHub Desktop.
Save master-q/1564437 to your computer and use it in GitHub Desktop.
GetAllPage.hs
#!/usr/bin/env runhaskell
{-# Language OverloadedStrings #-}
import Data.Text.Encoding
import qualified Data.Text.IO as DTIO
import System.Exit
import System.IO (hPutStrLn, stderr)
import System.Process
import Text.XmlHtml
import Text.XmlHtml.Cursor
-- | Collect the simple hyperlinks found in a forest of nodes.
--
-- A node is reported as a link only when it is an @a@ element that has
-- exactly one attribute and exactly one child, that child being a text node.
-- Each hit yields the pair @(attribute value, link text)@. Both components
-- are rendered with 'show', so they arrive wrapped in double quotes and
-- escaped like a string literal rather than as the raw text; 'curlIt' strips
-- those quotes again.
--
-- Every other node is searched recursively: the walk moves to the node's
-- first child (if it has one) and scans that child's sibling list.
findLink :: [Node] -> [(String, String)]
findLink = concatMap go
  where
    go :: Node -> [(String, String)]
    go Element { elementTag = "a"
               , elementAttrs = [(_, value)]
               , elementChildren = [TextNode label]
               } =
      [(show value, show label)]
    go node =
      -- 'maybe' replaces the former @fc /= Nothing@ guard plus irrefutable
      -- @let Just@ binding: no partial pattern and no 'Eq' comparison.
      maybe [] (findLink . siblings) (firstChild (fromNode node))
-- | Echo the @curl@ command line that would download the page behind one
-- link. The command is only printed through the shell's @echo@; it is not
-- executed.
--
-- The argument is an @(href, label)@ pair whose components are expected to be
-- 'show'-rendered strings (wrapped in double quotes), which is what
-- 'findLink' produces. The text following the first @?@ of the href becomes
-- the @p@ form field, and the label names the @.html@ output file.
--
-- Returns the exit code of the shell command. When the href has no @?@, or
-- nothing after it, a warning is written to stderr and @ExitFailure 1@ is
-- returned without running anything; the previous @tail . init@ chain raised
-- an exception on such input.
curlIt :: (String, String) -> IO ExitCode
curlIt (href, label) =
  case dropWhile (/= '?') href of
    -- Skip the '?' itself; the last character is the closing quote added by
    -- 'show', so it is dropped as well.
    _ : rest@(_ : _) -> system (command (shellSafe (dropLast rest)))
    _ -> do
      hPutStrLn stderr ("curlIt: no query string in link " ++ href ++ ", skipping")
      return (ExitFailure 1)
  where
    command :: String -> String
    command postp =
      "echo curl -d p=\"" ++ postp
        ++ "\" -d c=e http://www.sampou.org/cgi-bin/haskell.cgi -o \""
        ++ outfile
        ++ "\""

    -- Label without its surrounding quotes, plus the file extension.
    outfile :: String
    outfile = shellSafe (dropLast (drop 1 label)) ++ ".html"

    -- Total replacement for 'init': an empty list stays empty.
    dropLast :: [a] -> [a]
    dropLast xs = take (length xs - 1) xs

    -- The text is spliced between double quotes of a shell command, where
    -- @$@ and backquote still trigger expansion / command substitution.
    -- Prefix them with a backslash so scraped text is printed verbatim.
    -- (@"@ and @\\@ are already backslash-escaped by 'show'.)
    shellSafe :: String -> String
    shellSafe = concatMap escape

    escape :: Char -> String
    escape c
      | c == '$' || c == '`' = ['\\', c]
      | otherwise = [c]
-- | Read an HTML document from standard input and hand every link that
-- 'findLink' discovers in it to 'curlIt'.
--
-- A parse failure is reported on stderr and ends the program with a failure
-- exit status. A document for which no cursor can be built is treated as
-- containing no links. Both situations previously aborted with an
-- irrefutable-pattern failure.
main :: IO ()
main = do
  con <- DTIO.getContents
  case parseHTML "stdin" (encodeUtf8 con) of
    Left err -> do
      hPutStrLn stderr ("GetAllPage: cannot parse stdin as HTML: " ++ err)
      exitFailure
    Right doc ->
      -- When 'fromNodes' gives no cursor there is nothing to scan, so fall
      -- back to an empty node list instead of pattern-matching on 'Just'.
      mapM_ curlIt (findLink (maybe [] topNodes (fromNodes (docContent doc))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment