;; Forked from tmountain/crawler.clj
;; Created June 14, 2019 14:14
;; Namespace declaration for the crawler. Aliases: `c` for jsoup.soup,
;; `p` for clojure.pprint, `log` for a logging namespace.
(ns clj-sleuth.crawl
(:require [jsoup.soup :as c]
[clojure.pprint :as p]
;; NOTE(review): the namespace symbol in front of `:as log` is missing, so this
;; require vector cannot be loaded as written. `log/error` is called elsewhere in
;; this file with a Throwable as its first argument -- presumably
;; clojure.tools.logging; TODO confirm against the project dependencies and restore.
[ :as log]))
;; Install a JVM-wide default handler so that an exception escaping any thread
;; is written to the log together with the name of the thread it occurred on.
(Thread/setDefaultUncaughtExceptionHandler
 (reify Thread$UncaughtExceptionHandler
   (uncaughtException [_ thread ex]
     (log/error ex "Uncaught exception on" (.getName thread)))))
(defn prepend-http
  "Returns `domain` as a URL string carrying an http scheme.

  A bare host such as \"example.com\" becomes \"http://example.com\".
  Input that already starts with http:// or https:// (case-insensitive)
  is returned unchanged rather than being given a second scheme."
  [domain]
  ;; (str domain) keeps a nil argument from throwing inside re-find.
  (if (re-find #"(?i)^https?://" (str domain))
    domain
    (str "http://" domain)))
(defn url-to-file
  "Derives a file name from `url`.

  Every occurrence of \"http://\" or \"https://\" is removed and every dot
  is replaced by an underscore, e.g. \"http://www.example.com\" becomes
  \"www_example_com\". Other characters are left untouched."
  [url]
  (-> url
      (clojure.string/replace #"https?://" "")
      (clojure.string/replace #"[.]" "_")))
(defn get-url
  "Fetches `site` and returns a map describing the outcome.

  Returns {:url site, :data result} where `result` is whatever `c/$`
  yields for the fetched document with the `(.text)` form applied.
  If any Exception is thrown while fetching or extracting, `result` is
  instead the string \"Caught exception: \" followed by the exception
  message, so the caller always receives a map."
  [site]
  {:url site
   :data (try
           ;; :timeout 5000 is presumably milliseconds -- confirm against the
           ;; jsoup.soup wrapper. HTTP error statuses are not treated as failures.
           (-> (c/get! (prepend-http site) :timeout 5000 :ignore-http-errors true)
               (c/$ (.text)))
           (catch Exception e
             (str "Caught exception: " (.getMessage e))))})
(defn do-result
  "Writes the :data of `result` to a file under `outpath`.

  The file name is `outpath` concatenated with `(url-to-file (:url result))`.
  Nothing is written, and nil is returned, unless `result` is a map that
  contains both the :url and the :data key."
  [outpath result]
  (when (and (map? result)
             (every? (partial contains? result) [:url :data]))
    (let [{:keys [url data]} result
          target (str outpath (url-to-file url))]
      (spit target data))))
(defn file-exists?
  "Returns true when something exists on disk at `filepath`, false otherwise.

  `filepath` is coerced with clojure.java.io/file, so a path string or a
  java.io.File are both accepted."
  [filepath]
  (.exists (clojure.java.io/file filepath)))
(defn make-dispatcher
  "Returns a one-argument function that crawls a single domain into `outpath`.

  The returned function computes the output file for the domain
  (`outpath` + `(url-to-file domain)`). When that file is not yet present
  it fetches the domain with `get-url` and hands the result to `do-result`;
  when the file already exists it does nothing and returns nil, so a
  re-run skips domains that were already written."
  [outpath]
  (fn [domain]
    (let [file (str outpath (url-to-file domain))]
      ;; Only fetch when there is no result file yet. Testing for the file's
      ;; presence here instead would mean a run against an empty output
      ;; directory fetches nothing at all.
      (when-not (file-exists? file)
        (->> (get-url domain)
             (do-result outpath))))))
(defn run-crawl
  "Crawls every entry of one bucket.

  `data` is a map with :bucket (a collection of domains) and :outpath (the
  output path prefix handed to `make-dispatcher`). Each domain is passed to
  the dispatcher in order; the fully realised sequence of dispatcher return
  values is returned."
  [data]
  (let [bucket (:bucket data)
        outpath (:outpath data)
        dispatcher (make-dispatcher outpath)]
    ;; doall forces the lazy map so the crawling happens inside this call.
    (doall (map dispatcher bucket))))
(defn -main
[& args]
(with-open [rdr ( "data/domains_to_crawl.txt")]
(let [outpath "data/crawl_results/"
num-agents 256
urls (map clojure.string/trim (line-seq rdr))
num-items (count urls)
bucket-size (int (/ num-items num-agents))
buckets (partition bucket-size bucket-size [] urls)
agents (map #(agent {:bucket %, :outpath outpath}) buckets)]
(doall (map #(send-off % run-crawl) agents))
(apply await agents)
