Skip to content

Instantly share code, notes, and snippets.

@Lambdanaut
Created July 20, 2011 04:54
Show Gist options
  • Save Lambdanaut/1094364 to your computer and use it in GitHub Desktop.
Save Lambdanaut/1094364 to your computer and use it in GitHub Desktop.
A spider for crawling Sheezyart.com and determining social connections between users. It has also been confirmed to work on Deviantart.com with slight adjustment.
module SAScan where
import Network.Shpider
import System.Posix.Unistd
import Data.List
import Char
firstUser = "theprototype"
toLoad = False
savedFile = "SASaved"
depth = 1000
data User = User {gName :: String,
gWatchers :: [String],
gWatching :: [String]} deriving (Read,Show,Eq)
--Util--
-- Parses a saved file from a crawling session into a list of users
getUsers file = do
appendFile file ""
f <- readFile file
let (users:_) = splitOn ('|' ==) f
return (read users :: [User])
-- Takes a list of usernames and returns a list of usernames that watch all the usernames supplied
watchersOf :: [String] -> [User] -> [String]
watchersOf u1 u2 = sort $ map gName $ filter (\z -> all (\y -> elem y (gWatching z)) u1) u2
-- Takes a username and guesses at who they're watching
watching :: String -> [User] -> [String]
watching user users = sort . gWatching . head $ filter (\z -> gName z == user) users
--------
splitOn :: (a -> Bool) -> [a] -> [[a]]
splitOn _ [] = []
splitOn f l@(x:xs)
| f x = splitOn f xs
| otherwise = let (h,t) = break f l in h:(splitOn f t)
isJust (Just x) = True
isJust Nothing = False
start = newURL firstUser
newURL :: String -> String
newURL user = filter (/= ' ') $ "http://" ++ user ++ ".sheezyart.com/watch/"
mkNewUsers :: [String] -> User -> [User] -> [User]
mkNewUsers [] _ _ = []
mkNewUsers (watcher:watchers) user users = (User username userwatchers userwatching) : mkNewUsers watchers user users
where User username userwatchers userwatching = case length $ filter (\y -> watcher == gName y) users of
x | x > 0 -> User watcher (gWatchers search) ((gName user) : (gWatching search))
otherwise -> User watcher [] [gName user]
search = head $ filter (\y -> watcher == gName y) users
fuseCurUser :: User -> [User] -> [User]
fuseCurUser user users = User (gName user) watchers watching : notsearch
where search = filter (\y -> gName user == gName y) users
notsearch = filter (\y -> gName user /= gName y) users
watchers = gWatchers user ++ (concat $ map gWatchers search)
watching = gWatching user ++ (concat $ map gWatching search)
mkNewUser :: [String] -> [User] -> [String] -> User
mkNewUser [] users _ = User "ERROR" [] []
mkNewUser (toVisit:toVisits) users watchers = case elem toVisit $ map gName users of
True -> addWatchers $ head $ filter (\y -> toVisit == gName y) users
False -> mkNewUser toVisits users watchers
where addWatchers user = User (gName user) (gWatchers user ++ watchers) (gWatching user)
getWatchers page = do
-- (errCode,pageF) <- getPage -- Only progresses if the errorCode returned from the page download is Ok.
(errCode,pageF) <- download page
setCurrentPage pageF
links <- getLinksByTextRegex "^\\S.*"
let users = sort $ map (filter (/= ' ') . (map toLower) . linkText) links
return users
where
getPage = do (errCode,pageF) <- download page
if errCode == Ok then return (errCode,pageF) else getPage
mainLoop :: String -> Int -> User -> [User] -> [String] -> [String] -> Shpider [User]
mainLoop page x user users toVisit visited
| x > depth = finish users toVisit visited
| otherwise = do
watchers <- getWatchers page
let newVisited = gName user : visited -- Add current user to visited list
let newToVisit = filter (\y -> not $ elem y newVisited) $ toVisit ++ watchers -- Add the watchers of the current user to the visit list.
let nUsers = fuseCurUser user (mkNewUsers watchers user users)
let newUsers = (filter (\y -> not $ elem (gName y) (map gName nUsers)) users) ++ nUsers
let newUser = mkNewUser newToVisit newUsers watchers
let newPage = newURL $ gName newUser
lift $ putStrLn $ gName user ++ " @ " ++ show x ++ "/" ++ show depth ++ " " ++ page -- Debug Info
mainLoop newPage (x + 1) newUser newUsers newToVisit newVisited
finish users toVisit visited = do
lift $ putStrLn $ "Scanner finished! \nProgress file saved in: \"" ++ savedFile ++ "\"."
lift $ writeFile savedFile $ (show users) ++ "|" ++ (show toVisit) ++ "|" ++ (show visited)
return users
loadFile = do
lift $ appendFile savedFile ""
f <- lift $ readFile savedFile
let (users:toVisit:visited:_) = splitOn ('|' ==) f
if toLoad then return (read users :: [User],read toVisit :: [String],read visited :: [String]) else return ([],[],[])
initialize = do
onlyDownloadHtml True
stayOnDomain True
setCurrentPage . snd =<< download start
mainSphider = runShpider $ do
initialize
(users,toVisit,visited) <- loadFile
watchers <- getWatchers $ newURL firstUser
mainLoop start 1 (mkNewUser [firstUser] [User firstUser [] []] watchers) users toVisit visited
main = do
z <- mainSphider
return z
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment