Skip to content

Instantly share code, notes, and snippets.

@fmw
Created June 28, 2013 11:53
Show Gist options
  • Save fmw/5884161 to your computer and use it in GitHub Desktop.
Save fmw/5884161 to your computer and use it in GitHub Desktop.
(defn make-topology
  "Seeds the URI queue and builds the crawler topology.

  Takes a single map with the keys :seed-uris, :max-depth,
  :follow-links?-scoring-fn, :follow-individual-link?-scoring-fn,
  :store?-scoring-fn and :content-processing-fn. Each of the three
  scoring functions falls back to scoring/pass-all when it is nil or
  false.

  Every seed URI is handed to queue/add-uri! along with the MD5 of its
  host and the value 0 (presumably the starting depth -- confirm against
  queue/add-uri!). Returns whatever `topology` returns."
  [{:keys [seed-uris
           max-depth
           follow-links?-scoring-fn
           follow-individual-link?-scoring-fn
           store?-scoring-fn
           content-processing-fn]}]
  ;; Seeding happens before the topology is assembled.
  (doseq [seed seed-uris]
    (let [host-hash (util/md5 (util/get-host seed))]
      (queue/add-uri! host-hash seed 0)))
  (let [page-scorer (or follow-links?-scoring-fn scoring/pass-all)
        link-scorer (or follow-individual-link?-scoring-fn scoring/pass-all)
        store-scorer (or store?-scoring-fn scoring/pass-all)
        ;; Every bolt reads from exactly one upstream component, keyed on
        ;; the "host" field, with :p 3.
        host-bolt (fn [upstream component]
                    (bolt-spec {upstream ["host"]} component :p 3))]
    (topology
     ;; spouts
     {"1" (spout-spec uri-spout :p 3)}
     ;; bolts
     {"2" (host-bolt "1" request-page)
      ;; link-following branch: 2 -> 3 -> 4 -> 5
      "3" (host-bolt "2" (follow-links? max-depth page-scorer))
      "4" (host-bolt "3" extract-links)
      "5" (host-bolt "4" (follow-individual-link? link-scorer))
      ;; storage branch: 2 -> 6 -> 7
      "6" (host-bolt "2" (store? max-depth store-scorer))
      "7" (host-bolt "6" store)
      ;; content processing reads from bolt "3"
      "8" (host-bolt "3" (process-content content-processing-fn))})))
(defn crawl-remote!
  "Builds the topology described by topology-config and submits it via
  StormSubmitter/submitTopology under the name job-name, with
  TOPOLOGY-DEBUG set to true and TOPOLOGY-WORKERS set to 3."
  [job-name topology-config]
  (let [storm-conf {TOPOLOGY-DEBUG true
                    TOPOLOGY-WORKERS 3}]
    (StormSubmitter/submitTopology job-name
                                   storm-conf
                                   (make-topology topology-config))))
;; Error map thrown (via throw+) by crawl! when the :storm :type setting
;; is neither :in-process nor :remote. The namespaced :type keyword lets
;; callers tell this error apart from others.
(def invalid-storm-type-error
{:type ::invalid-storm-type-error
:message "The :storm :type configuration is invalid."})
(defn crawl!
  "Starts a crawl on the Storm cluster selected by the environment.

  Reads :storm :type, :storm :topology-name and
  :crawler :topology-config through get-in-env, then dispatches:

    :in-process -> crawl-in-process!
    :remote     -> crawl-remote!

  Any other :storm :type value throws invalid-storm-type-error with
  throw+."
  []
  (let [storm-type (get-in-env :storm :type)
        topology-name (get-in-env :storm :topology-name)
        topology-config (get-in-env :crawler :topology-config)]
    (case storm-type
      :in-process
      (do
        (prn "Using in-process Storm cluster")
        (crawl-in-process! topology-name topology-config))

      :remote
      (do
        (prn "Using a remote Storm cluster")
        (crawl-remote! topology-name topology-config))

      ;; no match: the configuration is unusable
      (throw+ invalid-storm-type-error))))
(defn -main
  "Command-line entry point.

  Parses the -e / --environment option out of args with `cli`, turns
  its value into a keyword and passes it to set-env!. It then prints
  the active environment, connects to the hosts listed under
  :riak :cluster-hostnames and calls crawl!."
  [& args]
  ;; Only the parsed options (first element of the cli result) are needed.
  (let [[params] (cli args ["-e" "--environment" "Environment name"])
        env (-> params :environment keyword)]
    (set-env! env)
    (prn (str "Using environment " (get-env)))
    (db/connect! (get-in-env :riak :cluster-hostnames))
    (crawl!)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment