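-- Crawler: visits pages under .caltech.edu, preferring simpler URLs, and
-- prints a histogram mapping per-page link counts to the number of pages
-- with that many links.
--
-- Assumed usage (not stated in the original gist): build with GHC and the
-- heap package, then run as "Crawler http://www.caltech.edu/", with a
-- fetcher.py script in the working directory that prints one link per line
-- for the URL it is given.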
module Main where

import qualified Data.Set as Set
import qualified Data.Heap as Heap
import qualified Data.IntMap as IntMap
import Data.List
import Control.Monad
import Data.Maybe
import System.Environment (getArgs)
import System.Process
import System.IO
import Control.Applicative ((<$>))

type Links = Set.Set String
type LinkScore = (Int, Int) -- ^ (URL complexity: number of '/', '&', '?' characters; epoch discovered)
type LinkHeap = Heap.MinPrioHeap LinkScore String
type Histogram = IntMap.IntMap Int

pagesToVisit :: Int
pagesToVisit = 2000

-- | Crawl outward from a starting page and return a histogram mapping
-- per-page link counts to the number of pages with that many links.
crawl :: String -> IO Histogram
crawl startingPage = do
    (_, _, histogram) <- crawlStep 1 Set.empty startingHeap IntMap.empty
    return histogram
  where startingHeap :: LinkHeap
        startingHeap = Heap.singleton ((0, 0), startingPage)

crawlStep :: Int       -- ^ Epoch number
          -> Links     -- ^ Set of visited pages
          -> LinkHeap  -- ^ Heap of pages to visit
          -> Histogram -- ^ Histogram of link counts
          -> IO (Links, LinkHeap, Histogram) -- ^ Updated structures
crawlStep epoch visited toVisit histogram =
    if Set.size visited >= pagesToVisit || Heap.isEmpty toVisit
      then return (visited, toVisit, histogram)
      else do
        let ((_, nextURL), toVisit') = fromJust $ Heap.view toVisit
        hPutStrLn stderr $ show (Set.size visited) ++ "/" ++
                           show pagesToVisit ++ ": " ++ nextURL
        links <- linksOnPage nextURL
        let visited'      = Set.insert nextURL visited
            filteredLinks = filterLinks links visited' toVisit'
            toVisit''     = recordLinks filteredLinks score toVisit'
            histogram'    = IntMap.alter (Just . maybe 1 succ) (Set.size links) histogram
        crawlStep (epoch + 1) visited' toVisit'' histogram'
  where
    -- Simpler URLs (fewer '/', '&', '?' characters) found in earlier epochs
    -- are visited first, e.g. score "http://www.caltech.edu/a/b?x=1&y=2" == (6, epoch).
    score link = (foldl (\t c -> t + length (elemIndices c link)) 0 "/&?", epoch)

-- | Fetch a page by shelling out to an external fetcher.py script, which is
-- expected to print one link per line; returns the empty set on error.
linksOnPage :: String -> IO Links
linksOnPage url = do
    (_, output, errors) <- readProcessWithExitCode "python" ["fetcher.py", url] ""
    if null errors
      then return $ Set.fromList (lines output)
      else do hPutStrLn stderr $ "ERROR on " ++ url ++ ": " ++ errors
              return Set.empty

recordLinks :: Links                 -- ^ Set of links to add
            -> (String -> LinkScore) -- ^ Score function
            -> LinkHeap              -- ^ Heap of pages to visit
            -> LinkHeap              -- ^ Updated pages to visit
recordLinks linksToAdd score toVisit =
    Heap.union toVisit $ Heap.fromList (Set.toList heapItemSet)
  where heapItemSet = Set.map (\l -> (score l, l)) linksToAdd

-- | Keep only links that are neither visited nor already queued, stay within
-- .caltech.edu, and don't point at document types we can't parse.
filterLinks :: Links -> Links -> LinkHeap -> Links
filterLinks links visited toVisit =
    let unvisited  = Set.difference links visited
        toVisitSet = Set.fromList $ map snd (Heap.toList toVisit)
        rawLinks   = Set.difference unvisited toVisitSet
    in Set.filter (\l -> isCaltech l && isNotDoc l) rawLinks
  where isCaltech = isInfixOf ".caltech.edu"
        isNotDoc s = not . or $ map (\ext -> isSuffixOf ext s) invalidExts
        invalidExts = [".pdf", ".doc", ".jpg"]

main :: IO ()
main = do
    args <- getArgs
    case args of
      [startingPage] -> do
        histogram <- crawl startingPage
        (putStr . showResults) histogram
      _ -> putStrLn "Usage: Crawler http://..."
  where showResults hist = IntMap.foldrWithKey (\k x ks -> ks ++ showResult k x) "" hist
        showResult num count = show num ++ "\t" ++ show count ++ "\n"