Skip to content

Instantly share code, notes, and snippets.

@ahxxm
Last active November 21, 2022 13:59
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
title + <div id="info" /> => local redis
(ns doubanbook-cralwer.core
  (:require [clj-http.client :as http]
            [clojure.core.async :refer [go-loop chan <! >!! <!!] :as a]
            ;; clojure.set / clojure.string are used fully-qualified below, so
            ;; load them explicitly instead of relying on transitive loading.
            [clojure.set]
            [clojure.string]
            [taoensso.carmine :as car :refer (wcar)])
  (:gen-class))
;; User-Agent header value sent with every HTTP request (see `get-ua`).
(def ua "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")

;; Carmine connection options: default pool settings, local Redis, database 0.
(def redis-opts
  {:pool {}
   :spec {:uri "redis://127.0.0.1:6379/0"}})

(defmacro wcar*
  "Runs the given Carmine commands via `car/wcar` using `redis-opts`."
  [& commands]
  `(car/wcar redis-opts ~@commands))

;; Atom holding the set of ids already known to the crawler. It is initialised
;; from every key present in Redis, so this issues `KEYS *` when the namespace
;; is loaded.
(def seen
  (atom (set (wcar* (car/keys "*")))))
(defn get-ua
  "Performs an HTTP GET on `url` with the `ua` User-Agent header.

  HTTP error statuses do not throw (`:throw-exceptions false`) and at most 5
  redirects are followed. If the request itself throws, the exception message
  is printed and an empty map is returned instead of a response."
  [url]
  (let [request-opts {:headers          {"User-Agent" ua}
                      :throw-exceptions false
                      :max-redirects    5}]
    (try
      (http/get url request-opts)
      (catch Exception ex
        ;; Best effort: log and hand back an empty response map.
        (println (str "caught exception: " (.getMessage ex)))
        {}))))
(defn extract-rsp
  "Pulls the interesting parts out of an HTTP response map.

  Returns a vector `[full books]` where
    * `full`  is the trimmed page title (with \"(豆瓣)\" removed) immediately
              followed by the first `<div id=\"info\" ...>...</div>` fragment,
              if the page has one;
    * `books` is the set of numeric ids found after
              `book.douban.com/subject/` anywhere in the body.

  A missing `:body` is treated as the empty string, giving `[\"\" #{}]`."
  [rsp]
  (let [html       (or (:body rsp) "")
        linked-ids (set (re-seq #"(?i)(?<=book.douban.com/subject/)\d+" html))
        page-title (-> (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" html)
                       (or "")
                       (clojure.string/replace "(豆瓣)" "")
                       (clojure.string/trim))
        ;; nil when the page has no info div; `str` below then adds nothing.
        info-div   (re-find #"<div id=\"info\"[\s\S]*?</div>" html)]
    [(str page-title info-div) linked-ids]))
(defn go-crawl
  "Blocking crawl worker; loops forever, so run it on its own thread.

  Each iteration:
    1. takes a book id from `id-ch` (blocking),
    2. fetches https://book.douban.com/subject/<id>/ and extracts the page
       info plus every book id linked from the page,
    3. puts `[id info]` on `info-ch` for the Redis writer,
    4. for every linked id not yet in `seen`, stores that id in Redis with a
       nil value (so a restart can find it as pending) and queues it on
       `id-ch`,
    5. adds all linked ids to `seen`.

  Every 100th iteration of this worker prints the fetch duration."
  [id-ch info-ch]
  (loop [i 0]
    (let [id         (<!! id-ch)
          url        (str "https://book.douban.com/subject/" id "/")
          before     (quot (System/nanoTime) 1000000)
          rsp        (get-ua url)
          after      (quot (System/nanoTime) 1000000)
          [info ids] (extract-rsp rsp)]
      ;; Optional politeness delay, currently disabled:
      ;; (<!! (a/timeout 600))
      (when (zero? (mod i 100))
        (println "GET book" id "finished in" (- after before) "ms"))
      (>!! info-ch [id info])
      ;; Fix: mark and enqueue the *discovered* id. The previous code set the
      ;; current page's key to nil and pushed the loop counter `i` instead.
      (doseq [new-id (remove @seen ids)]
        ;; nil value marks the id as known but not yet crawled.
        (wcar* (car/set new-id nil))
        (>!! id-ch new-id))
      (swap! seen clojure.set/union ids)
      (recur (inc i)))))
(defn go-write
  "Starts the single Redis writer and returns the go-loop's channel.

  Takes `[id info]` pairs from `info-ch` and stores them with SET:
    * a non-nil `info` is always written;
    * a nil `info` is written only when the key does not exist yet, so it can
      never replace an existing value.

  The loop ends (and the returned channel closes) when `info-ch` is closed."
  [info-ch]
  (go-loop []
    ;; `when-some` stops the loop on a closed channel instead of writing a nil key.
    (when-some [[id info] (<! info-ch)]
      ;; EXISTS replies with an integer count. The previous `(car/keys id)`
      ;; check returned a vector, which is always truthy, and scanned the
      ;; whole keyspace on every write. `or` short-circuits, so Redis is only
      ;; asked when `info` is nil.
      (when (or info (zero? (wcar* (car/exists id))))
        (wcar* (car/set id info)))
      (recur))))
(defn -main
  "Entry point: wires the channels together and runs the crawler.

  Starts a monitor thread that prints progress every 15 seconds, finds the
  pending ids (keys in `seen` whose Redis value is nil or empty), starts the
  Redis writer and 12 crawler threads, then seeds the id channel with the
  pending ids and blocks on the writer.

  Command-line arguments are accepted and ignored."
  [& _args]
  (let [bookid-ch (chan 500000)
        info-ch   (chan 10000)]
    ;; Progress monitor.
    (a/thread
      (loop []
        (<!! (a/timeout (* 15 1000)))
        (println "seen urls" (count @seen) "book remaining" (count (.buf bookid-ch)))
        (recur)))
    ;; Bootstrap list: keys whose value is nil or "" have not been crawled
    ;; successfully yet. Computed before any crawler runs, so it only
    ;; reflects state left by a previous run.
    (let [pending (filterv #(empty? (wcar* (car/get %))) @seen)
          ;; single redis writer
          writer  (go-write info-ch)]
      ;; n crawlers
      (dotimes [_ 12]
        (a/thread (go-crawl bookid-ch info-ch)))
      ;; Seed only after the consumers are running: with a backlog larger than
      ;; the channel buffer these blocking puts would otherwise never finish.
      (doseq [k pending]
        (>!! bookid-ch k))
      (<!! writer))))
@aleung
Copy link

aleung commented Nov 21, 2022

neodb真不错

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment