;; Gist by @JackDanger, created July 3, 2012 17:31

(ns crawler.crawler
  (:gen-class)
  (:require [net.cgrand.enlive-html :as html]
            [org.bovinegenius.exploding-fish :as uri]))

;; Urls that have already been fetched (failed attempts count too; see `fetch`).
(def crawled (atom #{}))

(defn fetch
  "Retrieve contents of a page at a given url"
  [url]
  (try
    (html/html-resource (java.net.URL. url))
    (catch Exception e (prn (str "unable to retrieve " url)))
    (finally (swap! crawled conj url))))

(defn base-url
  "Return the scheme and host of a url, i.e. everything before the path"
  [url]
  (let [u (uri/uri url)]
    (str (uri/scheme u) "://" (uri/host u))))

(defn absolutize
  "Make a root-relative href absolute by prepending the scheme and host of the page it was found on"
  [from href]
  (if (= \/ (first href))
    (str (base-url from) href)
    href))

(defn links-inside
  "Return a list of links in the html content of a given url, absolutized against that url"
  [url]
  (println url)
  ;; anchors without an href would otherwise yield nil entries in the queue
  (map #(absolutize url %)
       (remove nil? (map :href (map :attrs (html/select (fetch url) [:a]))))))

(defn run
  "Examines each entry in the queue, appending newly discovered links to the queue while doing so"
  [start]
  (loop [queue [start]]
    (println "Queue size: " (count queue))
    (if-let [link (first queue)]
      (recur
       ;; `crawled` is an atom, so it must be dereferenced before `contains?`
       (if (contains? @crawled link)
         (rest queue)
         (rest (concat queue (links-inside link))))))))

(defn -main
  "Crawls the whole web, beginning at Google News"
  []
  (run "http://news.google.com/"))