Created
July 12, 2020 01:07
-
-
Save drewverlee/8a89c5c28280b1c0775405429e6f0e0c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns drewverlee.webcrawler | |
(:require [clojure.core.async | |
:as a | |
:refer [>! <! >!! <!! go chan buffer close! thread | |
onto-chan | |
alts! alts!!]])) | |
;; Note in both solutions we mock the web crawl | |
;; my version | |
;; Closes via a timeout. This would be ideal if the set of URLs kept growing, but here
;; it's likely unnecessary.
;; Single-threaded mock crawler driven by one channel of URLs.
;;
;; `url->urls` stands in for the web: it maps a URL to the URLs found on that
;; page. URLs are fed through the `urls` channel; the loop takes them one at a
;; time, skips ones already seen, and queues the outgoing links of new ones.
;; The crawl ends when no URL arrives within `idle-ms` milliseconds.
(let [url->urls {:a [:b :e]
                 :c [:a]
                 :d [:e]}
      urls      (chan 10)
      idle-ms   1000
      ;; A fresh timeout channel is created for every wait, so the crawl stops
      ;; only after `idle-ms` of inactivity. A single shared timeout would act
      ;; as a global deadline and cut the crawl off once it fired, even with
      ;; URLs still queued.
      next-url  (fn [] (first (alts!! [urls (a/timeout idle-ms)])))]
  ;; Seed the channel with the known pages; `false` keeps `urls` open after
  ;; the collection has been copied onto it.
  (onto-chan urls (keys url->urls) false)
  (loop [url   (next-url)
         seen? #{}]
    (cond
      ;; nil means the idle timeout won the `alts!!`: nothing left to crawl.
      (nil? url)
      (do
        ;; Release any producer still parked on a put to `urls`.
        (close! urls)
        (println "crawled: " seen?))

      ;; Already visited: drop it and wait for the next URL.
      (seen? url)
      (recur (next-url) seen?)

      :else
      (do
        (println "crawl: " url)
        ;; Queue the outgoing links (nil for pages with no entry, which queues nothing).
        (onto-chan urls (url->urls url) false)
        (recur (next-url) (conj seen? url))))))
;; Andrian Smith's version | |
;; Handles the URL web crawl in a real thread, and the producers and consumers in
;; green/go threads, which is likely ideal.
;; Mock crawler split into a downloader and a coordinator, both go blocks.
;;
;; `url->urls` stands in for the web: it maps a URL to the URLs found on that
;; page. The coordinator hands URLs to the downloader over `to-download` and
;; receives `[url links]` pairs back over `downloaded`. The value of the
;; second go block (the set of crawled URLs) is delivered on the channel that
;; `go` returns, which is also the value of this `let`.
(let [url->urls   {:a [:b]
                   :b [:d]
                   :d [:a]}
      to-download (chan 10)
      downloaded  (chan 10)]
  ;; Downloader: for each requested URL, look up its links inside `thread`
  ;; (a real thread, standing in for blocking I/O) and report the result.
  ;; Exits when `to-download` is closed.
  (go
    (loop []
      (when-let [url (<! to-download)]
        (>! downloaded [url
                        (<! (thread (get url->urls url [])))])
        (recur))))
  ;; Coordinator: tracks which URLs are waiting to be requested (`to-visit`),
  ;; in flight (`pending?`), and finished (`seen?`).
  (go
    (loop [to-visit (set (keys url->urls))
           pending? #{}
           seen?    #{}]
      (prn to-visit pending? seen?)
      (if (and (empty? pending?)
               (empty? to-visit))
        (do
          ;; Nothing is queued or in flight, so the downloader is idle.
          ;; Closing its input lets its loop finish instead of staying
          ;; parked on `<!` forever.
          (close! to-download)
          seen?)
        (let [next-url   (first to-visit)
              ;; Offer the next URL to the downloader when there is one, and
              ;; always listen for finished downloads; whichever operation is
              ;; ready first is performed.
              ports      (if next-url
                           [[to-download next-url]
                            downloaded]
                           [downloaded])
              [val port] (alts! ports)]
          (cond
            ;; The put succeeded: the URL is now in flight.
            (= port to-download)
            (recur (disj to-visit next-url)
                   (conj pending? next-url)
                   seen?)

            ;; A download finished: queue its links that are neither in
            ;; flight nor already crawled, and mark the page as seen.
            (= port downloaded)
            (let [[from-url to-urls] val]
              (recur (into to-visit
                           (remove #(or (pending? %)
                                        (seen? %)))
                           to-urls)
                     (disj pending? from-url)
                     (conj seen? from-url)))))))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment