Last active November 21, 2022 13:59
title + <div id="info" /> => local redis
(ns doubanbook-cralwer.core
  (:require [clj-http.client :as http]
            [clojure.core.async :refer [go-loop chan <! >!! <!!] :as a]
            [clojure.set]
            [clojure.string]
            [taoensso.carmine :as car :refer (wcar)]))
;; User-Agent header value sent with every page request.
(def ua
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")

;; Carmine connection options: default pool settings plus the server URI.
;; NOTE(review): the URI carries no host/port in this copy -- confirm the
;; intended Redis address.
(def redis-opts
  {:pool {}
   :spec {:uri "redis://"}})

(defmacro wcar*
  "Runs the given Carmine commands against the connection described by
  `redis-opts`."
  [& body]
  `(car/wcar redis-opts ~@body))

;; Set of every id already known to the crawler. It is initialised when the
;; namespace loads by reading all keys (KEYS *) from Redis.
(def seen
  (atom (set (wcar* (car/keys "*")))))
(defn get-ua
  "Issues an HTTP GET for `url`, sending the `ua` User-Agent header.

  The request is made with `:throw-exceptions false` and follows at most
  5 redirects. Returns the clj-http response map, or nil when the request
  itself throws (the exception message is printed to stdout)."
  [url]
  (try
    (http/get url {:headers {"User-Agent" ua} :throw-exceptions false :max-redirects 5})
    (catch Exception e
      (println (str "caught exception: " (.getMessage e)))
      ;; Explicit nil so callers can treat a failed fetch as an empty response.
      nil)))
(defn extract-rsp
  "Pulls the stored text and the linked book ids out of a response map.

  `rsp` is a map with an optional :body string; nil or a missing body is
  treated as an empty page.

  Returns a vector [full books] where
    full  - the trimmed <title> text with the (豆瓣) suffix removed,
            immediately followed by the first <div id=info ...>...</div>
            fragment (nothing is appended when the fragment is absent);
    books - the set of digit strings that directly follow /subject/ in
            the body."
  [rsp]
  (let [body  (or (:body rsp) "")
        ;; NOTE(review): the look-behind text was missing from this copy of
        ;; the file (the pattern did not compile). /subject/ is assumed to be
        ;; the path prefix of book links -- confirm against a real page.
        books (into #{} (re-seq #"(?i)(?<=/subject/)\d+" body))
        title (clojure.string/replace
               (or (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" body) "")
               "(豆瓣)" "")
        info  (re-find #"<div id=\"info\"[\s\S]*?</div>" body)
        full  (str (clojure.string/trim title) info)]
    [full books]))
(defn go-crawl
  "Blocking crawler worker; intended to run on its own thread.

  Repeatedly takes a book id from `id-ch`, fetches its page with `get-ua`,
  and puts [id info] on `info-ch`. Every id found on the page that is not
  yet in `seen` is stored in Redis with a nil value (so a restart can find
  ids that were discovered but never crawled) and put on `id-ch`. All ids
  found on the page are then merged into `seen`.

  Prints the fetch time for every 100th page. Returns nil once `id-ch` is
  closed."
  [id-ch info-ch]
  (loop [i 0]
    ;; A closed channel yields nil; stop instead of crawling a nil id forever.
    (when-some [id (<!! id-ch)]
      (let [;; NOTE(review): the base URL was an empty string in this copy of
            ;; the file; the Douban book subject URL is assumed -- confirm.
            url        (str "https://book.douban.com/subject/" id "/")
            before     (quot (System/nanoTime) 1000000)
            rsp        (get-ua url)
            after      (quot (System/nanoTime) 1000000)
            [info ids] (extract-rsp rsp)]
        ;;(<! (a/timeout 600))
        (when (zero? (mod i 100))
          (println "GET book" id "finished in" (- after before) "ms"))
        (>!! info-ch [id info])
        (doseq [-id ids]
          (when-not (@seen -id)
            ;; new id set nil for restart bootstrap
            ;; (the placeholder belongs to the NEW id, not the page just crawled)
            (wcar* (car/set -id nil))
            ;; enqueue the new id itself, not the loop counter
            (>!! id-ch -id)))
        (swap! seen clojure.set/union ids)
        (recur (inc i))))))
(defn go-write
  "Starts the single Redis writer as a go-loop and returns its channel.

  Takes [id info] pairs from `info-ch`. When the key `id` does not exist in
  Redis the value is written unconditionally; when it already exists it is
  only overwritten if `info` is non-nil. The loop ends, and the returned
  channel closes, once `info-ch` is closed."
  [info-ch]
  (go-loop []
    ;; A closed channel yields nil; stop instead of looping on nil forever.
    (when-some [[id info] (<! info-ch)]
      ;; EXISTS returns the number of matching keys. The previous KEYS-based
      ;; check returned a vector, which is truthy even when empty, so the
      ;; key-absent branch could never run.
      (if (zero? (wcar* (car/exists id)))
        (wcar* (car/set id info))
        ;; else only overwrite nil
        (when info
          (wcar* (car/set id info))))
      (recur))))
(defn -main
  "Entry point: wires up the channels and starts the crawl.

  1. Starts a monitor thread that prints the size of `seen` and the number
     of buffered book ids every 15 seconds.
  2. Bootstraps the id channel with every known key whose stored value is
     empty (nil or an empty string), i.e. ids that were discovered but not
     yet crawled.
  3. Starts 12 crawler threads running `go-crawl`.
  4. Blocks on the single Redis writer started by `go-write`."
  [& args]
  (let [bookid-ch (chan 500000)
        info-ch   (chan 10000)]
    ;; Progress monitor. It runs on its own thread so that the endless
    ;; reporting loop does not block the rest of the start-up.
    (a/thread
      (loop []
        (<!! (a/timeout (* 15 1000)))
        (println "seen urls" (count @seen) "book remaining" (count (.buf bookid-ch)))
        (recur)))
    ;; bootstrap using keys with nil value
    (let [pending (filterv #(empty? (wcar* (car/get %))) @seen)] ;; nippy/nil or ""
      (doseq [k pending]
        (>!! bookid-ch k)))
    ;; n crawler
    (dotimes [_ 12]
      (a/thread (go-crawl bookid-ch info-ch)))
    ;; single redis writer
    (<!! (go-write info-ch))))
ahxxm commented Mar 9, 2020


import requests
import re
import time

# All of the following Goodreads cookies are required; fill in their values.
_COOKIES = {
    "ccsid": "",
    "u": "",
    "p": "",
    "_session_id2": "",
}

# One shared session carries the cookies and the User-Agent on every request.
s = requests.Session()
for _cookie_name, _cookie_value in _COOKIES.items():
    s.cookies.set(_cookie_name, _cookie_value)
s.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"

def create_one(isbn, author, title, publisher):
    """Create one book entry by submitting the site's "create book" form.

    Fetches the form page with the shared session ``s``, extracts the CSRF
    token from its ``csrf-token`` meta tag, then POSTs the book fields.
    Prints the resulting URL when the response text contains ``title``,
    and a failure line otherwise.

    Args:
        isbn: value for the ``book[isbn]`` form field.
        author: value for the ``author[name]`` form field.
        title: value for ``book[title]`` and ``book[sort_by_title]``.
        publisher: value for the ``book[publisher]`` form field.

    Raises:
        IndexError: if the form page contains no csrf-token meta tag.
    """
    # NOTE(review): both URL literals below are empty in this copy of the
    # script; fill in the form-page URL and the submit URL before running.
    rsp = s.get("")
    # Capture only up to the closing quote of the content attribute; the
    # previous greedy ".*" could run past it to a later quote on the line.
    token = re.search(r'(?<=csrf-token" content=")[^"]+', rsp.text)
    if token is None:
        raise IndexError("csrf-token meta tag not found in the form page")
    csrf = token.group(0)
    pdata = {
        "utf-8": "✓",
        "authenticity_token": csrf,
        "book[title]": title,
        "book[sort_by_title]": title,
        "author[name]": author,
        "book[isbn]": isbn,
        "book[publisher]": publisher,
        "book[update_default_description]": 1,
        "work[media_type]": "book",
        "commit": "Create book",
    }
    rsp = s.post("", data=pdata)
    if title in rsp.text:
        print(title, "created at", rsp.url)
    else:
        # Report the failure instead of skipping the book silently.
        print(title, "was NOT created, HTTP status", rsp.status_code)

# csv_data is a list of lists; each row has seven columns, of which only
# isbn, author, title and publisher are passed on to create_one.
for isbn, date, _, author, title, date_create, publisher in csv_data:
    create_one(isbn, author, title, publisher)

aleung commented Apr 15, 2020

现在 页面需要 reCAPTCHA,是不是没法用脚本添加书籍了?

ahxxm commented Apr 16, 2020 via email

CAUJP commented Nov 18, 2022


ahxxm commented Nov 18, 2022


aleung commented Nov 21, 2022


