ahxxm/doubanbook-cralwer.clj

## doubanbook-cralwer.clj
(ns doubanbook-cralwer.core
  (:require [clj-http.client :as http]
            [clojure.core.async :refer [go-loop chan <! >!! <!!] :as a]
            [taoensso.carmine :as car :refer (wcar)])
  (:gen-class))

;; User-Agent header value sent with every HTTP request made by `get-ua`.
(def ua
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")

;; Carmine connection options: empty pool options and the Redis URI below.
(def redis-opts
  {:pool {}
   :spec {:uri "redis://127.0.0.1:6379/0"}})

(defmacro wcar*
  "Evaluates `body` inside `car/wcar` using the connection in `redis-opts`."
  [& body]
  `(car/wcar redis-opts ~@body))

;; Atom holding the set of every key found in Redis (KEYS *) at the moment
;; this namespace is loaded; crawlers add newly discovered ids to it.
;; NOTE(review): this performs Redis I/O at load time - confirm that is
;; acceptable for AOT compilation with :gen-class.
(def seen
  (atom (set (wcar* (car/keys "*")))))

(defn get-ua
  "Performs an HTTP GET on `url` with the `ua` User-Agent header.

  Non-2xx statuses do not throw (:throw-exceptions false) and at most five
  redirects are followed. Returns the clj-http response map, or an empty map
  when the request raises an Exception (the message is printed to stdout)."
  [url]
  (let [request-opts {:headers          {"User-Agent" ua}
                      :throw-exceptions false
                      :max-redirects    5}]
    (try
      (http/get url request-opts)
      (catch Exception ex
        (let [msg (.getMessage ex)]
          (println (str "caught exception: " msg)))
        ;; Callers treat a map without :body as an empty page.
        {}))))

(defn extract-rsp
  "Extracts book data from the response map `rsp`.

  Returns a vector `[full books]` where
    full  - the trimmed <title> text (with \"(豆瓣)\" removed) immediately
            followed by the first `<div id=\"info\" ...</div>` fragment, if any;
    books - the set of numeric ids (as strings) that follow
            `book.douban.com/subject/` anywhere in the body.
  A missing :body is treated as the empty string."
  [rsp]
  (let [html      (or (:body rsp) "")
        raw-title (or (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" html) "")
        heading   (-> raw-title
                      (clojure.string/replace "(豆瓣)" "")
                      clojure.string/trim)
        ;; Lazy match: stops at the first closing </div> after the info div opens.
        info-div  (re-find #"<div id=\"info\"[\s\S]*?</div>" html)
        linked    (set (re-seq #"(?i)(?<=book.douban.com/subject/)\d+" html))]
    [(str heading info-div) linked]))

(defn go-crawl
  "Blocking crawl worker; intended to run on its own thread.

  Repeatedly takes a book id from `id-ch`, fetches its page, puts
  `[id info]` on `info-ch`, and enqueues every linked id that is not yet in
  `seen` back onto `id-ch`. Each newly discovered id is first stored in Redis
  with a nil value so that a restart can find ids that were never crawled.

  Prints the request latency for every 100th book handled by this worker.
  Returns nil once `id-ch` is closed and drained."
  [id-ch info-ch]
  (loop [i 0]
    ;; A closed channel yields nil; stop instead of requesting \"/subject//\".
    (when-some [id (<!! id-ch)]
      (let [url        (str "https://book.douban.com/subject/" id "/")
            before     (quot (System/nanoTime) 1000000)
            rsp        (get-ua url)
            after      (quot (System/nanoTime) 1000000)
            [info ids] (extract-rsp rsp)]
        ;; Throttling is currently disabled: (<!! (a/timeout 600))
        (when (zero? (mod i 100))
          (println "GET book" id "finished in" (- after before) "ms"))

        (>!! info-ch [id info])
        (doseq [new-id ids]
          (when-not (@seen new-id)
            ;; Placeholder for the *discovered* id (not the page just crawled),
            ;; so the restart bootstrap can pick it up.
            (wcar* (car/set new-id nil))
            (>!! id-ch new-id)))

        ;; `ids` is a set, so `into` is a set union using clojure.core only.
        (swap! seen into ids)
        (recur (inc i))))))

(defn go-write
  "Starts the single Redis writer and returns its go-loop channel.

  Takes `[id info]` pairs from `info-ch`. Non-empty `info` is always stored
  under `id`. Empty `info` (e.g. a failed fetch) is stored only when `id` has
  no key yet, so it can never replace data that is already present.
  The loop ends, closing the returned channel, when `info-ch` is closed."
  [info-ch]
  (go-loop []
    (when-some [[id info] (<! info-ch)]
      ;; `info` comes from `str`, so it is never nil: test emptiness, not
      ;; truthiness. EXISTS replies with an integer (0 = no such key), unlike
      ;; KEYS, whose (possibly empty) collection reply is always truthy.
      (when (or (seq info)
                (zero? (wcar* (car/exists id))))
        (wcar* (car/set id info)))
      (recur))))

(defn -main
  "Entry point: runs a stats reporter, 12 crawler threads and one Redis writer.

  Work is bootstrapped from the keys loaded into `seen` whose Redis value is
  empty (nil or \"\"); those ids are queued for crawling. If no such key
  exists nothing is queued and the crawlers stay idle. Command-line
  arguments are accepted and ignored. Blocks on the writer's channel."
  [& _args]
  (let [bookid-ch (chan 500000)
        info-ch   (chan 10000)]
    ;; Progress report every 15 seconds.
    (a/thread
      (loop []
        (<!! (a/timeout (* 15 1000)))
        (println "seen urls" (count @seen) "book remaining" (count (.buf bookid-ch)))
        (recur)))

    ;; Collect the ids still waiting to be crawled before any crawler runs,
    ;; so the snapshot cannot contain ids discovered during this run.
    (let [pending (filterv #(empty? (wcar* (car/get %))) @seen)]
      ;; n crawlers - started before the queue is filled so that a backlog
      ;; larger than the channel buffer cannot block the bootstrap forever.
      (dotimes [_ 12]
        (a/thread (go-crawl bookid-ch info-ch)))

      (doseq [k pending]
        (>!! bookid-ch k)))

    ;; single redis writer
    (<!! (go-write info-ch))))
	(ns doubanbook-cralwer.core
	(:require [clj-http.client :as http]
	[clojure.core.async :refer [go-loop chan <! >!! <!!] :as a]
	[taoensso.carmine :as car :refer (wcar)])
	(:gen-class))

	;; User-Agent header value sent with every HTTP request made by `get-ua`.
	(def ua
	  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")

	;; Carmine connection options: empty pool options and the Redis URI below.
	(def redis-opts
	  {:pool {}
	   :spec {:uri "redis://127.0.0.1:6379/0"}})

	(defmacro wcar*
	  "Evaluates `body` inside `car/wcar` using the connection in `redis-opts`."
	  [& body]
	  `(car/wcar redis-opts ~@body))

	;; Atom holding the set of every key found in Redis (KEYS *) at the moment
	;; this namespace is loaded; crawlers add newly discovered ids to it.
	;; NOTE(review): this performs Redis I/O at load time - confirm that is
	;; acceptable for AOT compilation with :gen-class.
	(def seen
	  (atom (set (wcar* (car/keys "*")))))

	(defn get-ua
	  "Performs an HTTP GET on `url` with the `ua` User-Agent header.

	  Non-2xx statuses do not throw (:throw-exceptions false) and at most five
	  redirects are followed. Returns the clj-http response map, or an empty map
	  when the request raises an Exception (the message is printed to stdout)."
	  [url]
	  (let [request-opts {:headers          {"User-Agent" ua}
	                      :throw-exceptions false
	                      :max-redirects    5}]
	    (try
	      (http/get url request-opts)
	      (catch Exception ex
	        (let [msg (.getMessage ex)]
	          (println (str "caught exception: " msg)))
	        ;; Callers treat a map without :body as an empty page.
	        {}))))

	(defn extract-rsp
	  "Extracts book data from the response map `rsp`.

	  Returns a vector `[full books]` where
	    full  - the trimmed <title> text (with \"(豆瓣)\" removed) immediately
	            followed by the first `<div id=\"info\" ...</div>` fragment, if any;
	    books - the set of numeric ids (as strings) that follow
	            `book.douban.com/subject/` anywhere in the body.
	  A missing :body is treated as the empty string."
	  [rsp]
	  (let [html      (or (:body rsp) "")
	        raw-title (or (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" html) "")
	        heading   (-> raw-title
	                      (clojure.string/replace "(豆瓣)" "")
	                      clojure.string/trim)
	        ;; Lazy match: stops at the first closing </div> after the info div opens.
	        info-div  (re-find #"<div id=\"info\"[\s\S]*?</div>" html)
	        linked    (set (re-seq #"(?i)(?<=book.douban.com/subject/)\d+" html))]
	    [(str heading info-div) linked]))

	(defn go-crawl
	  "Blocking crawl worker; intended to run on its own thread.

	  Repeatedly takes a book id from `id-ch`, fetches its page, puts
	  `[id info]` on `info-ch`, and enqueues every linked id that is not yet in
	  `seen` back onto `id-ch`. Each newly discovered id is first stored in Redis
	  with a nil value so that a restart can find ids that were never crawled.

	  Prints the request latency for every 100th book handled by this worker.
	  Returns nil once `id-ch` is closed and drained."
	  [id-ch info-ch]
	  (loop [i 0]
	    ;; A closed channel yields nil; stop instead of requesting \"/subject//\".
	    (when-some [id (<!! id-ch)]
	      (let [url        (str "https://book.douban.com/subject/" id "/")
	            before     (quot (System/nanoTime) 1000000)
	            rsp        (get-ua url)
	            after      (quot (System/nanoTime) 1000000)
	            [info ids] (extract-rsp rsp)]
	        ;; Throttling is currently disabled: (<!! (a/timeout 600))
	        (when (zero? (mod i 100))
	          (println "GET book" id "finished in" (- after before) "ms"))

	        (>!! info-ch [id info])
	        (doseq [new-id ids]
	          (when-not (@seen new-id)
	            ;; Placeholder for the *discovered* id (not the page just crawled),
	            ;; so the restart bootstrap can pick it up.
	            (wcar* (car/set new-id nil))
	            (>!! id-ch new-id)))

	        ;; `ids` is a set, so `into` is a set union using clojure.core only.
	        (swap! seen into ids)
	        (recur (inc i))))))

	(defn go-write
	  "Starts the single Redis writer and returns its go-loop channel.

	  Takes `[id info]` pairs from `info-ch`. Non-empty `info` is always stored
	  under `id`. Empty `info` (e.g. a failed fetch) is stored only when `id` has
	  no key yet, so it can never replace data that is already present.
	  The loop ends, closing the returned channel, when `info-ch` is closed."
	  [info-ch]
	  (go-loop []
	    (when-some [[id info] (<! info-ch)]
	      ;; `info` comes from `str`, so it is never nil: test emptiness, not
	      ;; truthiness. EXISTS replies with an integer (0 = no such key), unlike
	      ;; KEYS, whose (possibly empty) collection reply is always truthy.
	      (when (or (seq info)
	                (zero? (wcar* (car/exists id))))
	        (wcar* (car/set id info)))
	      (recur))))

	(defn -main
	  "Entry point: runs a stats reporter, 12 crawler threads and one Redis writer.

	  Work is bootstrapped from the keys loaded into `seen` whose Redis value is
	  empty (nil or \"\"); those ids are queued for crawling. If no such key
	  exists nothing is queued and the crawlers stay idle. Command-line
	  arguments are accepted and ignored. Blocks on the writer's channel."
	  [& _args]
	  (let [bookid-ch (chan 500000)
	        info-ch   (chan 10000)]
	    ;; Progress report every 15 seconds.
	    (a/thread
	      (loop []
	        (<!! (a/timeout (* 15 1000)))
	        (println "seen urls" (count @seen) "book remaining" (count (.buf bookid-ch)))
	        (recur)))

	    ;; Collect the ids still waiting to be crawled before any crawler runs,
	    ;; so the snapshot cannot contain ids discovered during this run.
	    (let [pending (filterv #(empty? (wcar* (car/get %))) @seen)]
	      ;; n crawlers - started before the queue is filled so that a backlog
	      ;; larger than the channel buffer cannot block the bootstrap forever.
	      (dotimes [_ 12]
	        (a/thread (go-crawl bookid-ch info-ch)))

	      (doseq [k pending]
	        (>!! bookid-ch k)))

	    ;; single redis writer
	    (<!! (go-write info-ch))))