Last active
August 29, 2015 14:07
-
-
Save sinelaw/746680f6a0183c3c3b54 to your computer and use it in GitHub Desktop.
tagsoup web scraping demo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module WikiNews where | |
import Network.HTTP(simpleHTTP, getRequest, getResponseBody) | |
import Data.List(isInfixOf, intersperse) | |
import Data.Char(isSpace, toLower) | |
import Control.Category((>>>)) | |
import System.Random(getStdGen) | |
-- from package tagsoup: | |
import Text.HTML.TagSoup(parseTags, innerText, Tag) | |
import Text.HTML.TagSoup.Match(tagText) | |
-- from package markov-chain: | |
import Data.MarkovChain as MC | |
-- | Build a request for the given URL with 'getRequest', send it with
-- 'simpleHTTP', and return the body of the response.
openURL :: String -> IO String
openURL url = simpleHTTP (getRequest url) >>= getResponseBody
-- | Strip whitespace (as classified by 'isSpace') from both ends of a string.
--
-- Interior whitespace is preserved. A string consisting only of whitespace
-- becomes the empty string.
trim :: String -> String
trim = dropTrailing . dropWhile isSpace
  where
    -- Right fold that discards a whitespace character only when everything
    -- after it has already been discarded, i.e. only the trailing run.
    dropTrailing = foldr keep []
    keep c rest
      | isSpace c && null rest = []
      | otherwise = c : rest
-- | Extract the non-empty text lines lying between two markers in a tag stream.
--
-- The tags are scanned for the first one matching @tagText (isInfixOf startStr)@;
-- that tag is dropped, and the following tags are kept up to (not including)
-- the first one matching @tagText (isInfixOf endStr)@. The last kept tag is
-- also discarded. The remaining tags are flattened with 'innerText', split
-- into lines, passed through 'trim', and empty lines are removed.
--
-- Returns @[]@ when the start marker is never found or when nothing lies
-- between the markers (previously this case crashed in 'init').
filterSection :: [Tag String] -> String -> String -> [String]
filterSection tags startStr endStr =
  filterBetween >>> innerText >>> lines >>> map trim >>> nonEmpty $ tags
  where
    notStarted = not . tagText (isInfixOf startStr)
    notEnded = not . tagText (isInfixOf endStr)
    filterBetween = dropLast . takeWhile notEnded . drop 1 . dropWhile notStarted
    -- Total stand-in for 'init': pairing each element with its successor
    -- drops the final element and yields [] for an empty list.
    dropLast xs = zipWith const xs (drop 1 xs)
    nonEmpty = filter ("" /=)
-- | Run the two scraping demos: print a section of the Wikipedia front page,
-- then print a Markov-chain generated "summary" of a section of Go's FAQ page.
main :: IO ()
main = do
  -- Demo 1: extracting the news section of the wikipedia front page
  src <- openURL "http://en.wikipedia.org/wiki/Main_Page"
  let tags = parseTags src
  -- mapM_ discards the results instead of building a throwaway list of units.
  mapM_ putStrLn $ filterSection tags "In the news" "Ongoing"
  -- Demo 2: generating a random "summary" of go's FAQ page, using the markov-chain package
  stdGen <- getStdGen
  vmsrc <- openURL "http://golang.org/doc/faq" --"http://www.vmware.com/virtualization/virtualization-basics/what-is-virtualization.html"
  let vmtags = parseTags vmsrc
      -- Lower-cased word list used as the training sequence for the chain.
      text = words . map toLower . unwords $ filterSection vmtags "No major systems language has emerged" "Build version go"
  putStrLn "\nSummary:"
  -- NOTE(review): how MC.run behaves on an empty training sequence is not
  -- visible from this file, so generation is skipped rather than relied upon.
  if null text
    then putStrLn "(no text found between the section markers)"
    else putStrLn . unwords . take 150 $ MC.run 1 text 13 stdGen
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment