Skip to content

Instantly share code, notes, and snippets.

@tmountain
Created September 1, 2017 13:21
Show Gist options
  • Save tmountain/04937a84e299730b814b194616e44330 to your computer and use it in GitHub Desktop.
Save tmountain/04937a84e299730b814b194616e44330 to your computer and use it in GitHub Desktop.
(ns clj-sleuth.crawl
  (:require [clojure.java.io]
            [clojure.pprint :as p]
            [clojure.string]
            [clojure.tools.logging :as log]
            [jsoup.soup :as c]))
;; Install the JVM-wide default uncaught-exception handler at namespace load
;; time: any exception that reaches it is logged at error level together with
;; the name of the thread it was raised on.
(let [handler (reify Thread$UncaughtExceptionHandler
                (uncaughtException [_this t throwable]
                  (log/error throwable "Uncaught exception on" (.getName t))))]
  (Thread/setDefaultUncaughtExceptionHandler handler))
(defn prepend-http
  "Returns `domain` as a URL string that carries a scheme.

  A bare host such as \"example.com\" gets \"http://\" prepended. A value that
  already begins with http:// or https:// (any letter case) is returned as-is,
  so it is not turned into \"http://https://...\"."
  [domain]
  (let [s (str domain)]
    (if (re-find #"(?i)^https?://" s)
      s
      (str "http://" s))))
(defn url-to-file
  "Derives a file name from `url`: every \"http://\" or \"https://\" occurrence
  is removed, then each dot is replaced by an underscore."
  [url]
  (let [without-scheme (clojure.string/replace url #"https?://" "")
        flattened      (clojure.string/replace without-scheme #"[.]" "_")]
    flattened))
(defn get-url
  "Fetches `site` over HTTP and returns a map with two keys:

    :url  - `site`, exactly as passed in
    :data - the result of calling `.text` on the fetched document, or, when an
            Exception is thrown, the string \"Caught exception: \" followed by
            the exception message.

  The request is made with a 5000 timeout and with HTTP error statuses ignored
  (NOTE(review): timeout unit is presumably milliseconds - confirm against
  jsoup.soup). Exceptions are never propagated to the caller."
  [site]
  (let [body (try
               (c/$ (c/get! (prepend-http site)
                            :timeout 5000
                            :ignore-http-errors true)
                    (.text))
               (catch Exception err
                 (str "Caught exception: " (.getMessage err))))]
    {:url site
     :data body}))
(defn do-result
  "Writes a crawl result to disk.

  `outpath` is a string prefix (it is concatenated, so a directory must end
  with a separator). `result` is expected to be a map holding both :url and
  :data; the :data value is written to the file named
  `outpath` + (url-to-file (:url result)).

  Anything that is not such a map is ignored. Returns nil in every case."
  [outpath result]
  (when (and (map? result)
             (contains? result :url)
             (contains? result :data))
    (let [target (str outpath (url-to-file (:url result)))]
      ;; spit does not create missing directories and would throw if the
      ;; output directory is absent, so create the parents first.
      (clojure.java.io/make-parents target)
      (spit target (:data result)))))
(defn file-exists?
  "Returns true when `filepath` names something that exists on disk,
  false otherwise."
  [filepath]
  (let [f (clojure.java.io/as-file filepath)]
    (.exists f)))
(defn make-dispatcher
  "Builds a one-argument function that crawls a single domain into `outpath`.

  The returned function first works out the result file for the domain
  (`outpath` + url-to-file of the domain). If that file already exists it does
  nothing and returns nil; otherwise it fetches the domain with get-url and
  hands the result to do-result."
  [outpath]
  (fn [domain]
    (let [target (str outpath (url-to-file domain))]
      (when-not (file-exists? target)
        (do-result outpath (get-url domain))))))
(defn run-crawl
  "Crawls every domain in one bucket.

  `data` is a map with :bucket (the collection of domains) and :outpath (the
  output path prefix). Each domain is passed, in order, to a dispatcher built
  for :outpath; the fully realized sequence of the dispatcher's return values
  is returned."
  [data]
  (let [dispatch! (make-dispatcher (:outpath data))]
    (->> (:bucket data)
         (map dispatch!)
         doall)))
(defn -main
  "Entry point: crawls every domain listed in data/domains_to_crawl.txt.

  Reads the file one domain per line (lines are trimmed, blank lines are
  skipped), splits the domains into at most `num-agents` buckets, starts one
  agent per bucket running run-crawl with results going to
  data/crawl_results/, waits for all agents to finish and then shuts the agent
  thread pools down. `args` is ignored. Returns nil."
  [& args]
  (with-open [rdr (clojure.java.io/reader "data/domains_to_crawl.txt")]
    (let [outpath     "data/crawl_results/"
          num-agents  256
          ;; Realize the lines while the reader is still open.
          urls        (->> (line-seq rdr)
                           (map clojure.string/trim)
                           (remove clojure.string/blank?)
                           doall)
          num-items   (count urls)
          ;; Ceiling division keeps the bucket count <= num-agents; the floor
          ;; of 1 matters because a size of 0 (fewer items than agents) would
          ;; make partitioning produce an endless run of empty buckets.
          bucket-size (max 1 (quot (+ num-items (dec num-agents)) num-agents))
          buckets     (partition-all bucket-size urls)
          agents      (mapv #(agent {:bucket % :outpath outpath}) buckets)]
      (try
        (doseq [a agents]
          (send-off a run-crawl))
        (when (seq agents)
          (apply await agents))
        (finally
          ;; Always stop the agent thread pools, even if dispatching or
          ;; awaiting throws, so the JVM is able to exit.
          (shutdown-agents))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment