@lotz84
Last active January 30, 2016 07:26
{-# LANGUAGE OverloadedStrings #-}
import Control.Concurrent (threadDelay)
import Control.Lens
import Control.Monad.IO.Class (liftIO)
import Control.Monad.State
import Data.List (nub, isPrefixOf)
import Data.Text.Lazy.Encoding (decodeUtf8)
import Data.Text.Lens (unpacked)
import qualified Network.Wreq as Wreq
import Text.Taggy.Lens

baseUrl :: String
baseUrl = "http://www.example.com/"
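
-- Crawl pages reachable from the given URL up to the given depth,
-- keeping the list of already-visited URLs in the StateT state.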
scraping :: Int -> String -> StateT [String] IO ()
scraping 0 _   = pure ()                                      -- termination condition: depth exhausted
scraping n url = do
  seen <- gets (elem url)                                     -- have we already visited this URL?
  unless seen $ do                                            -- do nothing if we have
    liftIO . putStrLn $ show n ++ ": " ++ url                 -- show the remaining depth and the current URL
    res <- liftIO $ Wreq.get url                              -- download the page
    visited <- modify (url:) >> get                           -- record this URL and grab the updated visited list
    let next = nub $ res ^.. Wreq.responseBody . to decodeUtf8 . html  -- parse the response body as HTML
                 . allNamed (only "a") . attrs . at "href" . _Just     -- take the href attribute of every <a> tag, if present
                 . unpacked . filtered (isPrefixOf baseUrl)            -- convert to String and keep only links under baseUrl
                 . filtered (not . (`elem` visited))                   -- drop URLs we have already visited
    liftIO $ threadDelay 1000000                              -- wait one second so the crawl does not become a DoS attack
    mapM_ (scraping (n - 1)) next                             -- scrape each discovered link recursively
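
-- Start crawling from baseUrl with a maximum depth of 3 and an empty visited list.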
main :: IO ()
main = evalStateT (scraping 3 baseUrl) []