Last active
November 21, 2022 13:59
-
-
Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
title + <div id="info" /> => local redis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns doubanbook-cralwer.core
  (:require [clj-http.client :as http]
            [clojure.core.async :refer [go-loop chan <! >!! <!!] :as a]
            [clojure.set]
            [clojure.string]
            [taoensso.carmine :as car :refer (wcar)])
  (:gen-class))
;; User-Agent header value sent with every HTTP request made by get-ua.
(def ua
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")

;; Carmine connection options: default pool, Redis on localhost, database 0.
(def redis-opts
  {:pool {}
   :spec {:uri "redis://127.0.0.1:6379/0"}})

(defmacro wcar*
  "Runs the given Carmine commands against the connection described by
  `redis-opts`."
  [& body]
  `(car/wcar redis-opts ~@body))

;; Atom holding the set of ids already known. It is seeded when the namespace
;; loads with every key currently present in Redis (KEYS *).
(def seen
  (atom (set (wcar* (car/keys "*")))))
(defn get-ua
  "Issues an HTTP GET for `url` with the `ua` User-Agent header.

  Non-2xx statuses do not throw (`:throw-exceptions false`) and at most five
  redirects are followed. Returns the clj-http response map; if the request
  itself raises an exception, the message is printed and an empty map is
  returned instead."
  [url]
  (let [request-opts {:headers {"User-Agent" ua}
                      :throw-exceptions false
                      :max-redirects 5}]
    (try
      (http/get url request-opts)
      (catch Exception ex
        (println (str "caught exception: " (.getMessage ex)))
        {}))))
(defn extract-rsp
  "Pulls the interesting pieces out of a response map.

  Reads `(:body rsp)` (treated as \"\" when missing) and returns a vector
  `[full books]` where:
    * `full`  is the trimmed page title with the \"(豆瓣)\" suffix removed,
              concatenated with the first `<div id=\"info\">…</div>` fragment
              (nothing is appended when that fragment is absent);
    * `books` is the set of numeric ids that follow
              `book.douban.com/subject/` anywhere in the body."
  [rsp]
  (let [html        (or (:body rsp) "")
        raw-title   (or (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" html) "")
        clean-title (-> raw-title
                        (clojure.string/replace "(豆瓣)" "")
                        clojure.string/trim)
        info-div    (re-find #"<div id=\"info\"[\s\S]*?</div>" html)
        book-ids    (set (re-seq #"(?i)(?<=book.douban.com/subject/)\d+" html))]
    [(str clean-title info-div) book-ids]))
(defn go-crawl
  "Blocking crawler loop; intended to run on its own thread.

  Repeatedly:
    1. takes a book id from `id-ch`,
    2. fetches https://book.douban.com/subject/<id>/ via `get-ua`,
    3. puts `[id info]` on `info-ch` for the writer,
    4. for every book id found on the page that is not yet in `seen`,
       stores a nil placeholder in Redis under that NEW id (so a restart can
       pick it up again) and enqueues that id on `id-ch`,
    5. merges all ids found on the page into `seen`.

  Every 100th iteration prints how long the fetch took. Never returns."
  [id-ch info-ch]
  (loop [i 0]
    (let [id (<!! id-ch)
          url (str "https://book.douban.com/subject/" id "/")
          before (quot (System/nanoTime) 1000000)
          rsp (get-ua url)
          after (quot (System/nanoTime) 1000000)
          [info ids] (extract-rsp rsp)]
      ;; throttle, currently disabled:
      ;;(<! (a/timeout 600))
      (when (zero? (mod i 100))
        (println "GET book" id "finished in" (- after before) "ms"))
      (>!! info-ch [id info])
      (doseq [new-id ids]
        (when-not (@seen new-id)
          ;; Placeholder goes under the discovered id, not the page just
          ;; crawled: writing it under `id` could clobber that page's info.
          (wcar* (car/set new-id nil))
          ;; Enqueue the discovered id itself (previously the loop counter
          ;; was enqueued, so discovered books were never crawled).
          (>!! id-ch new-id)))
      (swap! seen clojure.set/union ids)
      (recur (inc i)))))
(defn go-write
  "Single Redis writer. Consumes `[id info]` pairs from `info-ch` and stores
  `info` under `id`.

  A key that does not exist yet is always written; an existing key is only
  overwritten when `info` is non-nil. The loop ends when `info-ch` is closed.
  Returns the channel of the underlying go block."
  [info-ch]
  (go-loop []
    ;; A nil take means the channel was closed: stop instead of spinning.
    (when-let [[id info] (<! info-ch)]
      ;; EXISTS returns 0 or 1. The previous `(car/keys id)` returned a
      ;; collection, which is truthy even when empty, and treated `id` as a
      ;; glob pattern scanned against the whole keyspace.
      (let [exists? (pos? (wcar* (car/exists id)))]
        (when (or (not exists?) info)
          (wcar* (car/set id info))))
      (recur))))
(defn -main
  "Entry point. Command-line arguments are accepted and ignored.

  Starts a reporter thread that prints progress every 15 seconds, collects
  the ids in `seen` whose Redis value is still empty (nil placeholder or \"\"),
  starts the Redis writer and 12 crawler threads, feeds the pending ids to the
  crawlers, then blocks on the writer."
  [& _args]
  (let [bookid-ch (chan 500000)
        info-ch (chan 10000)]
    (a/thread
      (loop []
        (<!! (a/timeout (* 15 1000)))
        (println "seen urls" (count @seen) "book remaining" (count (.buf bookid-ch)))
        (recur)))
    ;; bootstrap: ids whose stored value is nil or "" still need crawling
    (let [pending (filterv #(empty? (wcar* (car/get %))) @seen)
          ;; Start the consumers BEFORE feeding the queue: with more pending
          ;; ids than the channel buffer holds, the feed below would
          ;; otherwise block forever with nobody taking.
          writer (go-write info-ch)]
      ;; n crawlers
      (dotimes [_ 12]
        (a/thread (go-crawl bookid-ch info-ch)))
      (doseq [id pending]
        (>!! bookid-ch id))
      ;; block on the single redis writer
      (<!! writer))))
现在要librarian才能创建图书,不如去neodb,自动爬各处图书,维护也比较积极
neodb真不错
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
现在没办法了么?