Created
July 20, 2011 04:54
-
-
Save Lambdanaut/1094364 to your computer and use it in GitHub Desktop.
A spider for crawling Sheezyart.com and determining social connections between users. It has also been confirmed to work on Deviantart.com with slight adjustment.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module SAScan where | |
import Network.Shpider | |
import System.Posix.Unistd | |
import Data.List | |
import Char | |
firstUser = "theprototype" | |
toLoad = False | |
savedFile = "SASaved" | |
depth = 1000 | |
data User = User {gName :: String, | |
gWatchers :: [String], | |
gWatching :: [String]} deriving (Read,Show,Eq) | |
--Util-- | |
-- Parses a saved file from a crawling session into a list of users | |
getUsers file = do | |
appendFile file "" | |
f <- readFile file | |
let (users:_) = splitOn ('|' ==) f | |
return (read users :: [User]) | |
-- Takes a list of usernames and returns a list of usernames that watch all the usernames supplied | |
watchersOf :: [String] -> [User] -> [String] | |
watchersOf u1 u2 = sort $ map gName $ filter (\z -> all (\y -> elem y (gWatching z)) u1) u2 | |
-- Takes a username and guesses at who they're watching | |
watching :: String -> [User] -> [String] | |
watching user users = sort . gWatching . head $ filter (\z -> gName z == user) users | |
-------- | |
splitOn :: (a -> Bool) -> [a] -> [[a]] | |
splitOn _ [] = [] | |
splitOn f l@(x:xs) | |
| f x = splitOn f xs | |
| otherwise = let (h,t) = break f l in h:(splitOn f t) | |
isJust (Just x) = True | |
isJust Nothing = False | |
start = newURL firstUser | |
newURL :: String -> String | |
newURL user = filter (/= ' ') $ "http://" ++ user ++ ".sheezyart.com/watch/" | |
mkNewUsers :: [String] -> User -> [User] -> [User] | |
mkNewUsers [] _ _ = [] | |
mkNewUsers (watcher:watchers) user users = (User username userwatchers userwatching) : mkNewUsers watchers user users | |
where User username userwatchers userwatching = case length $ filter (\y -> watcher == gName y) users of | |
x | x > 0 -> User watcher (gWatchers search) ((gName user) : (gWatching search)) | |
otherwise -> User watcher [] [gName user] | |
search = head $ filter (\y -> watcher == gName y) users | |
fuseCurUser :: User -> [User] -> [User] | |
fuseCurUser user users = User (gName user) watchers watching : notsearch | |
where search = filter (\y -> gName user == gName y) users | |
notsearch = filter (\y -> gName user /= gName y) users | |
watchers = gWatchers user ++ (concat $ map gWatchers search) | |
watching = gWatching user ++ (concat $ map gWatching search) | |
mkNewUser :: [String] -> [User] -> [String] -> User | |
mkNewUser [] users _ = User "ERROR" [] [] | |
mkNewUser (toVisit:toVisits) users watchers = case elem toVisit $ map gName users of | |
True -> addWatchers $ head $ filter (\y -> toVisit == gName y) users | |
False -> mkNewUser toVisits users watchers | |
where addWatchers user = User (gName user) (gWatchers user ++ watchers) (gWatching user) | |
getWatchers page = do | |
-- (errCode,pageF) <- getPage -- Only progresses if the errorCode returned from the page download is Ok. | |
(errCode,pageF) <- download page | |
setCurrentPage pageF | |
links <- getLinksByTextRegex "^\\S.*" | |
let users = sort $ map (filter (/= ' ') . (map toLower) . linkText) links | |
return users | |
where | |
getPage = do (errCode,pageF) <- download page | |
if errCode == Ok then return (errCode,pageF) else getPage | |
mainLoop :: String -> Int -> User -> [User] -> [String] -> [String] -> Shpider [User] | |
mainLoop page x user users toVisit visited | |
| x > depth = finish users toVisit visited | |
| otherwise = do | |
watchers <- getWatchers page | |
let newVisited = gName user : visited -- Add current user to visited list | |
let newToVisit = filter (\y -> not $ elem y newVisited) $ toVisit ++ watchers -- Add the watchers of the current user to the visit list. | |
let nUsers = fuseCurUser user (mkNewUsers watchers user users) | |
let newUsers = (filter (\y -> not $ elem (gName y) (map gName nUsers)) users) ++ nUsers | |
let newUser = mkNewUser newToVisit newUsers watchers | |
let newPage = newURL $ gName newUser | |
lift $ putStrLn $ gName user ++ " @ " ++ show x ++ "/" ++ show depth ++ " " ++ page -- Debug Info | |
mainLoop newPage (x + 1) newUser newUsers newToVisit newVisited | |
finish users toVisit visited = do | |
lift $ putStrLn $ "Scanner finished! \nProgress file saved in: \"" ++ savedFile ++ "\"." | |
lift $ writeFile savedFile $ (show users) ++ "|" ++ (show toVisit) ++ "|" ++ (show visited) | |
return users | |
loadFile = do | |
lift $ appendFile savedFile "" | |
f <- lift $ readFile savedFile | |
let (users:toVisit:visited:_) = splitOn ('|' ==) f | |
if toLoad then return (read users :: [User],read toVisit :: [String],read visited :: [String]) else return ([],[],[]) | |
initialize = do | |
onlyDownloadHtml True | |
stayOnDomain True | |
setCurrentPage . snd =<< download start | |
mainSphider = runShpider $ do | |
initialize | |
(users,toVisit,visited) <- loadFile | |
watchers <- getWatchers $ newURL firstUser | |
mainLoop start 1 (mkNewUser [firstUser] [User firstUser [] []] watchers) users toVisit visited | |
main = do | |
z <- mainSphider | |
return z |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment