Skip to content

Instantly share code, notes, and snippets.

@copy
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save copy/ebb87e8da204de29c11c to your computer and use it in GitHub Desktop.
Save copy/ebb87e8da204de29c11c to your computer and use it in GitHub Desktop.
Crawler
(ns http-loader.core
(:gen-class))
(require '[clj-http.client :as client])
;(require '[pl.danieljanus.tagsoup :as parser])
(require '[net.cgrand.enlive-html :as html])
(require '[net.cgrand.tagsoup :as parser])
(require '[clojurewerkz.urly.core :as url])
(defn grab
  "Issues an HTTP GET for `url` through clj-http, with the
  `:throw-exceptions false` option set, and returns the value stored under
  `:body` in the response map."
  [url]
  (-> url
      (client/get {:throw-exceptions false})
      (get :body)))
(defn parse-html
  "Wraps the string `html-data` in a java.io.StringReader and hands it to
  enlive's `html-resource`, returning whatever enlive parses out of it."
  [html-data]
  (let [reader (java.io.StringReader. html-data)]
    (html/html-resource reader)))
(defn find-links
  "Extracts the outgoing links of a page.

  `html-data` is the page markup as a string; `base-url` is the URL the page
  was loaded from and is what every href gets resolved against.

  Returns a lazy sequence of URL strings: one per `<a>` element that carries
  an href, resolved against `base-url`, restricted to the http and https
  protocols, with any fragment removed. Duplicates are not removed."
  [html-data base-url]
  (->> (html/select (parse-html html-data) [:a])
       ;; Take the first value enlive reports for :href; anchors without an
       ;; href yield nil and are dropped by `keep`.
       (keep (fn [anchor] (first (html/attr-values anchor :href))))
       (map (fn [href] (url/resolve base-url href)))
       (map url/url-like)
       ;; Only web links are of interest (no mailto:, javascript:, ...).
       (filter (fn [link] (#{"http" "https"} (.getProtocol link))))
       (map (fn [link] (.withoutFragment link)))
       (map str)))
(defn repeat-str
  "Returns the string formed by concatenating `n` copies of `chr`.
  Yields the empty string when `n` is zero or negative."
  [chr n]
  (->> chr
       (repeat n)
       (apply str)))
(defn load-and-print
  "Crawls outward from `url`, printing each visited URL on its own line,
  prefixed by one `>` per level (the start page gets a single `>`).

  Arities:
    [max-depth url]
      Starts a crawl at depth 0 with an empty `seen` set.
    [current-depth max-depth url seen]
      Recursive step. `seen` is the set of URLs already expanded on the path
      from the start page down to this one; a URL found in it is skipped.

  Pages at `max-depth` are printed but not fetched. A page whose response
  has no body is printed but not expanded. Each distinct link on a page is
  followed once, even if the page repeats it. Returns nil."
  ([max-depth url]
   (load-and-print 0 max-depth url #{}))
  ([current-depth max-depth url seen]
   (when-not (seen url)
     (println (str (repeat-str ">" (inc current-depth)) " " url))
     (when (< current-depth max-depth)
       ;; grab yields nil when the response carries no body; feeding nil to
       ;; the parser would throw and abort the whole crawl, so skip instead.
       (when-let [body (grab url)]
         (let [seen (conj seen url)]
           ;; `seen` only grows along the recursion path, so without
           ;; `distinct` an href repeated on this page would be printed and
           ;; fetched once per occurrence.
           (doseq [next-url (distinct (find-links body url))]
             (load-and-print (inc current-depth) max-depth next-url seen))))))))
(defn -main
  "Command-line entry point.

  Usage: [start-url [depth]]
    start-url  URL to begin crawling at; defaults to \"http://copy.sh/\".
               (e.g. pass \"http://localhost/test.html\" for a local test page)
    depth      maximum crawl depth as a decimal integer; defaults to 1.
               A non-numeric value raises NumberFormatException.

  With no arguments the behavior is the same as the previously hard-coded
  crawl of http://copy.sh/ at depth 1."
  [& args]
  (let [[url-arg depth-arg] args
        start-url (or url-arg "http://copy.sh/")
        depth (if depth-arg
                (Integer/parseInt depth-arg)
                1)]
    (load-and-print depth start-url)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment