Skip to content

Instantly share code, notes, and snippets.

@sinelaw
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sinelaw/746680f6a0183c3c3b54 to your computer and use it in GitHub Desktop.
Save sinelaw/746680f6a0183c3c3b54 to your computer and use it in GitHub Desktop.
tagsoup web scraping demo
module WikiNews where
import Network.HTTP(simpleHTTP, getRequest, getResponseBody)
import Data.List(isInfixOf, intersperse, dropWhileEnd)
import Data.Char(isSpace, toLower)
import Control.Category((>>>))
import System.Random(getStdGen)
-- from package tagsoup:
import Text.HTML.TagSoup(parseTags, innerText, Tag)
import Text.HTML.TagSoup.Match(tagText)
-- from package markov-chain:
import Data.MarkovChain as MC
-- | Issue a simple HTTP GET for the given URL and return the response body.
openURL :: String -> IO String
openURL url = do
    response <- simpleHTTP (getRequest url)
    getResponseBody response
-- | Strip whitespace from both ends of a string.
--
-- Leading whitespace is removed with 'dropWhile' and trailing whitespace with
-- 'dropWhileEnd'; whitespace in the interior of the string is left untouched.
-- A string made up only of whitespace becomes @""@.
trim :: String -> String
trim = dropWhileEnd isSpace . dropWhile isSpace
-- | Extract the non-empty text lines found between two marker strings.
--
-- The tag list is scanned for the first text tag whose text contains
-- @startStr@. Everything after that tag, up to (but not including) the first
-- text tag whose text contains @endStr@, is kept, and the final tag of that
-- span is dropped as well. The inner text of the remaining tags is split into
-- lines, each line is passed through 'trim', and empty lines are discarded.
--
-- The result is @[]@ when the start marker never occurs or when nothing lies
-- between the two markers.
filterSection :: [Tag String] -> String -> String -> [String]
filterSection tags startStr endStr =
    filterBetween >>> innerText >>> lines >>> map trim >>> nonEmpty $ tags
  where
    notStarted = not . tagText (isInfixOf startStr)
    notEnded = not . tagText (isInfixOf endStr)
    filterBetween = dropLast . takeWhile notEnded . drop 1 . dropWhile notStarted
    -- Total stand-in for 'init': yields [] for an empty list instead of
    -- throwing, so a missing start marker no longer crashes the program.
    dropLast xs = zipWith const xs (drop 1 xs)
    nonEmpty = filter (not . null)
-- | Run the two scraping demos in sequence.
--
-- Demo 1 downloads the Wikipedia main page and prints every line that
-- 'filterSection' finds between the "In the news" and "Ongoing" markers.
-- Demo 2 downloads the Go FAQ, lower-cases the selected section, splits it
-- into words and prints up to 150 words produced by 'MC.run' from that word
-- list.
main :: IO ()
main = do
    -- Demo 1: extracting the news section of the wikipedia front page
    src <- openURL "http://en.wikipedia.org/wiki/Main_Page"
    let tags = parseTags src
    -- mapM_ runs the prints for their effect only; no list of () is built.
    mapM_ putStrLn $ filterSection tags "In the news" "Ongoing"
    -- Demo 2: generating a random "summary" of go's FAQ page, using the markov-chain package
    stdGen <- getStdGen
    -- Commented-out alternative source page:
    -- "http://www.vmware.com/virtualization/virtualization-basics/what-is-virtualization.html"
    faqSrc <- openURL "http://golang.org/doc/faq"
    let faqTags = parseTags faqSrc
        -- Rejoin the section's lines with spaces, lower-case them, and split
        -- into the word list handed to the Markov chain.
        corpus = words . map toLower . unwords $
            filterSection faqTags "No major systems language has emerged" "Build version go"
    putStrLn "\nSummary:"
    -- NOTE(review): 1 and 13 are presumably MC.run's prediction-context size
    -- and start index into the corpus; confirm against the markov-chain docs.
    putStrLn . unwords . take 150 $ MC.run 1 corpus 13 stdGen
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment