Skip to content

Instantly share code, notes, and snippets.

@ninegua
Created April 22, 2015 04:37
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ninegua/923175791dec7a8a3259 to your computer and use it in GitHub Desktop.
Save ninegua/923175791dec7a8a3259 to your computer and use it in GitHub Desktop.
Weibo Scrapper (using conduit == 0.2.*) written in 2012
{-# LANGUAGE OverloadedStrings, RankNTypes, ScopedTypeVariables,
NoMonomorphismRestriction, DeriveDataTypeable #-}
module Main where
import Prelude hiding (and, catch)
import Data.Char (toLower)
import Data.Conduit
import Data.Conduit.Util
import Data.Conduit.ImageSize (sinkImageInfo)
import Data.Conduit.Binary (sourceFile, conduitFile, sinkFile)
import Data.Conduit.Lazy (lazyConsume)
import qualified Data.Conduit.List as CL
import Network.HTTP.Conduit (parseUrl, Response (..), urlEncodedBody, withManager, Cookie(..))
import Network.HTTP.Conduit.Browser (BrowserAction, setCookieFilter, browse, makeRequest, setUserAgent)
import Text.HTML.TagStream (Token, Token'(..), tokenStream, encode)
import qualified Network.HTTP.Types as T
import System.Environment (getArgs)
import System.IO (stderr, hPutStrLn, getContents)
import System.Directory (renameFile)
import System.Random (randomRIO)
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.UTF8 as LU
import qualified Data.ByteString.UTF8 as BU
import Control.Monad (when, (=<<))
import Control.Monad.IO.Class (liftIO)
import Control.Monad.State (StateT)
import qualified Control.Monad.State as S
import Control.Monad.Trans (lift)
import Control.Monad.ST (ST, runST)
import Control.Arrow ((***), first, second)
import Control.Exception.Lifted
import Data.Monoid (Monoid, mappend, mempty)
import qualified Data.String as DS
import Data.List (splitAt, partition, dropWhile, takeWhile, span,
isInfixOf, isPrefixOf, intercalate)
import Data.Typeable (Typeable)
import Data.CaseInsensitive (mk)
import qualified Data.Map as M
import Data.String.Utils (split, replace)
import Control.Concurrent (threadDelay, newMVar, withMVar, modifyMVar)
iphoneAgent = "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1A543a Safari/419.3"
prefix = "http://3g.sina.com.cn/prog/wapsite/sso"
weibo = "http://weibo.cn"
main = do
login : pass : _ <- getArgs
cookieCache <- newMVar M.empty
withManager (flip browse $
setCookieFilter (cookieFilter cookieCache) >>
setUserAgent (BU.fromString iphoneAgent) >>
startLogin (BU.fromString login) (BU.fromString pass) >>=
afterLogin >>=
myWeiqun >>=
countPages "/qun/detail/" >>=
allMyQunPages >>=
debug . show)
cookieFilter cookieCache _ cookie = do
modifyMVar cookieCache $ \cache -> do
let name = BU.toString $ cookie_name cookie
value = BU.toString $ cookie_value cookie
cache' = M.insert name value cache
debug $ "Set Cookie: " ++ name ++ " " ++ value
return (cache', True)
consume file src = lift $ src $= dump file $$ CL.consume
data MyException
= MalURL
| ErrorResponse String
| LoginURLNotFound
| LoginFormNotFound
| UIDNotFound
| HomeURLNotFound
| MyWeiboURLNotFound
| PageListNotFound
| PageTotalNotFound
| PageTotalNotNumber
| CommentTotalNotFound
| CommentTotalNotNumber
deriving (Show, Typeable)
instance Exception MyException
data Msg = Msg [Token] [[Token]]
instance Show Msg where
show (Msg m s) = BU.toString $ B.intercalate "\n" $
"Text: " : p m : "Comment: " : map p s
where p = B.append " " . encode
msgToHtml (Msg m s) =
addDiv "message" $ B.concat $
[encode m, addDiv "comments" $ B.concat $ map (addDiv "comment" . encode) s]
where addDiv c s = B.concat ["<div class=\"", c, "\">", s, "</div>"]
instance Monoid Msg where
mappend (Msg a b) (Msg c d) = (Msg a (b ++ d))
mempty = Msg [] []
debug = liftIO . hPutStrLn stderr
send' msg modReq url = do
case parseUrl url of
Nothing -> throwIO MalURL
Just req -> do
let req' = modReq req
debug $ msg ++ url
liftIO $ randomRIO (0,4::Int) >>= threadDelay . (*1000000)
Response status headers body <- makeRequest req'
debug $ unlines $ ["Status: " ++ show status,
"Headers: " ++ show headers]
case T.statusCode status of
302 -> return $ Right status
200 -> return $ Left body
_ -> return $ Right status
send msg modReq url = do
x <- send' msg modReq url
case x of
Right status -> throwIO $ ErrorResponse $ show status
Left body -> return $ body $= tokenStream
post form = send "Post to: " $ urlEncodedBody (M.toList form)
get = send "Get from: " id
getFile = fmap (either id (const $ CL.sourceList [])) . send' "Get file from: " id
dump = mapConduit (encode . (:[])) . conduitFile
matchTag tag key check (TagOpen name attrs _) = tag == name && (maybe False check $ lookup key attrs)
matchTag _ _ _ _ = False
pickAttr name = maybe [] ((:[]) . amp . BU.toString) . lookup name
parseHref = CL.concatMap $ pickAttr "href" . fst
parseForm = CL.concatMap $ uncurry zip .
(pickAttr "action" *** (:[]) . M.fromList . concatMap isInput)
where
isInput (TagOpen a attrs _) | a == "input" =
case (lookup "name" attrs, lookup "value" attrs) of
(Just n, Just v) -> [(n, v)]
(Just n, _) -> [(n, B.empty)]
_ -> []
isInput _ = []
parseRefresh = CL.concatMap $ map (dropWhile (/='h')) . pickAttr "content" . fst
grabTag match = grabTag' match =$= CL.concatMap (either (:[]) (const []))
grabTag' match = sequenceSink () $ \() -> do
t <- CL.head
return $
case t of
Just x@(TagOpen tag attrs closed) | match x ->
if closed then Emit () [Left (attrs, [])]
else StartConduit $ grabClose tag attrs
Just x -> Emit () [Right x]
Nothing -> Stop
where
grabClose tag attrs = sequenceSink (Just (0, [])) $ \state -> do
case state of
Nothing -> return $ StartConduit $ grabTag' match
Just (l, xs) -> do
t <- CL.head
return $
case t of
Just x ->
case x of
TagOpen a _ _ | a == tag -> Emit (Just (l + 1, x:xs)) []
TagClose a | a == tag -> if l > 0 then Emit (Just (l - 1, x:xs)) []
else Emit Nothing [Left (attrs, reverse xs)]
_ -> Emit (Just (l, x:xs)) []
Nothing -> Stop
startLogin login pass = do
src <- get weibo
login <- throwWhenNothing LoginURLNotFound $ lift $ src
$= grabTag (matchTag "a" "href" $ B.isInfixOf "sso/login")
$= parseHref $$ CL.head
src <- get login
(url, form) <- throwWhenNothing LoginFormNotFound $ lift $ src
$= grabTag (matchTag "form" "action" $ const True)
$= parseForm $$ CL.head
post (fill form) (prefix </> url)
where
f p q k v = if B.isPrefixOf p k then q else v
fill = M.insert "mobile" login . M.mapWithKey (f "password" pass)
afterLogin src = (get =<<) $
throwWhenNothing HomeURLNotFound $ lift $ src
$= grabTag (matchTag "meta" "http-equiv" (=="refresh"))
$= parseRefresh $$ CL.head
myWeiqun src = (get . (weibo </>) =<<) $
throwWhenNothing MyWeiboURLNotFound $ lift $ src
$= grabTag (matchTag "a" "href" $ B.isInfixOf "/qun?")
$= parseHref $$ CL.head
countPages formStr src = do
(url, form) <- throwWhenNothing PageListNotFound $ lift $ src $= dump "main.html"
$= dropUntil (matchTag "div" "id" (=="pagelist"))
$= grabTag (matchTag "form" "action" $ B.isInfixOf formStr)
$= parseForm $$ CL.head
debug $ show form
totals <- throwWhenNothing PageTotalNotFound $
return $ fmap BU.toString $ M.lookup "mp" form
total <- liftIO $ catch (readIO totals :: IO Int) (throw' PageTotalNotNumber)
return (url, total, form)
parseMsg = parseMsg' 1
parseMsg' i formStr src = do
(msg, (cmts, pgs)) <- lift $ src $$ fmap accum $
grabTag' (matchTag "div" "id" $ B.isPrefixOf "M_") =|=
grabTag' (matchTag "div" "id" $ B.isPrefixOf "C_") =|=
(grabTag (matchTag "form" "action" $ B.isInfixOf formStr) =$= parseForm) =$
CL.consume
debug $ "Got msg form: " ++ show pgs
let m = Msg (concat $ map snd msg) (map snd cmts)
j = i + 1
case pgs of
(url, form):_ -> do
s <- throwWhenNothing CommentTotalNotFound $
return $ fmap BU.toString $ M.lookup "mp" form
n <- liftIO $ catch (readIO s :: IO Int) (throw' CommentTotalNotNumber)
if j <= n
then fmap (mappend m) $
post (M.insert "page" (showB j) form) (weibo </> url) >>=
parseMsg' j formStr . ($= dump "second.html")
else return m
_ -> return m
where
aux f g = either (first . f) (second . g)
accum = foldr (aux (:) (aux (:) (:))) ([], ([], []))
throw' :: MyException -> SomeException -> a
throw' e _ = throw e
throwWhenNothing e io = io >>= maybe (throwIO e) return
amp = replace "&amp;" "&"
(</>) p q = if isPrefixOf "http://" q then q
else p ++ "/" ++ (dropWhile (=='/') q)
showB = BU.fromString . show
myQunPage url form i = do
src <- post form' (weibo </> url)
urls <- lift $ src
$= grabTag (matchTag "div" "id" $ B.isPrefixOf "M_")
$= parseDivId $$ CL.consume
mapM_ ((>>= printMsg) . get . (weibo </>)) urls
where
printMsg src = parseMsg "act=view" src >>= dumpMsg >>= liftIO . B.putStrLn . msgToHtml
form' = M.insert "page" (showB i) form
idToURL id = "/dpool/ttt/grouphome.php?act=view&gmid=" ++ drop 2 id ++ "&groupid=198197&" ++ (dropWhile(=='?') $ dropWhile (not . (=='?')) url)
parseDivId = CL.concatMap $ map idToURL . pickAttr "id" . fst
allMyQunPages (url, total, form) = mapM_ (myQunPage url form) [35..55]
dumpMsg (Msg body cmt) = do
body' <- fmap tagImage $ dumpImage body
return $ Msg body' cmt
tagImage (x@(TagOpen a attrs True) : _ : TagOpen b bttrs True : xs) | a == "img" && b == "a" =
TagOpen "a" bttrs False : x : TagClose "a" : tagImage xs
tagImage (x:xs) = x : tagImage xs
tagImage [] = []
dumpImage (TagOpen a _ _ : x@(TagOpen b attrs closed) : xs) | a == "a" && b == "img" = do
let src = pickAttr "src" attrs
xs' = dropUntilAfter (isTagClose "a") xs
debug $ "src = " ++ show src
y <- case src of
[url] -> do
let file = "thumb/" ++ (last $ split "/" url)
safeGuard () $ getFile (weibo </> url) >>= lift . ($$ sinkFile file)
return $ TagOpen b (replaceAttr "src" (BU.fromString file) attrs) closed
_ -> return x
fmap (y:) $ dumpImage xs'
dumpImage (x@(TagOpen a attrs False):y:xs) | a == "a" = do
let href = pickAttr "href" attrs
let next = fmap (x:) $ dumpImage (y:xs)
case (y, href) of
(Text y', [url]) | y' == BU.fromString "原图" -> do
let query = last $ split "/" url
args = map (split "=") $ split "&" $ dropUntilAfter (=='?') query
hash = concat $ concat $ map tail $ filter (isPrefixOf ["u"]) args
if null hash then next
else do
let file = "image/" ++ hash
safeGuard () $ getFile (weibo </> url) >>= lift . ($$ sinkFile file)
info <- safeGuard Nothing $ lift (sourceFile file $$ sinkImageInfo)
let file' = case info of
Just (_, format) -> file ++ "." ++ map toLower (show format)
_ -> file
liftIO $ when (file /= file') $ renameFile file file'
let xs' = dropUntilAfter (isTagClose "a") xs
x' = TagOpen "a" [("href", BU.fromString file')] True
fmap (x':) $ dumpImage xs'
_ -> next
dumpImage (x:xs) = fmap (x:) $ dumpImage xs
dumpImage [] = return []
replaceAttr key val attrs = (key, val) : filter ((/=key) . fst) attrs
isTagClose a (TagClose b) | a == b = True
| otherwise = False
isTagClose _ _ = False
dropUntilAfter f = dropWhile f . dropWhile (not . f)
safeGuard d io = catch io $ \(e :: SomeException) -> debug (show e) >> return d
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment