Skip to content

Instantly share code, notes, and snippets.

Last active November 21, 2022 13:59
Show Gist options
  • Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
title + <div id="info" /> => local redis
(ns doubanbook-cralwer.core
(:require [clj-http.client :as http]
[clojure.core.async :refer [go-loop chan <! >!! <!!] :as a]
[taoensso.carmine :as car :refer (wcar)])
(def ua
  "User-Agent header value sent by `get-ua` with every request."
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")

(def redis-opts
  "Carmine connection options used by `wcar*`."
  {:spec {:uri "redis://"}
   :pool {}})

(defmacro wcar*
  "Run the Carmine commands in `body` against the connection in `redis-opts`."
  [& body]
  `(car/wcar redis-opts ~@body))

(def seen
  "Atom holding a set of ids. It starts as every key present in Redis when
  this namespace loads; `go-crawl` unions newly discovered ids into it."
  (atom (set (wcar* (car/keys "*")))))
(defn get-ua
  "GET `url` with the crawler's User-Agent header.

  Follows at most 5 redirects and does not throw on HTTP error statuses
  (`:throw-exceptions false`). Returns the clj-http response map, or nil when
  the request itself throws; the exception message is printed in that case."
  [url]
  ;; NOTE(review): this copy of the function had lost its parameter vector,
  ;; the `(try` that the `catch` belongs to, and its closing parens. They are
  ;; restored from the body (`url`, `catch`) and the one-argument call site.
  (try
    (http/get url {:headers          {"User-Agent" ua}
                   :throw-exceptions false
                   :max-redirects    5})
    (catch Exception e
      (println (str "caught exception: " (.getMessage e)))
      ;; Explicit nil: callers treat a missing response as an empty page.
      nil)))
(defn extract-rsp
  "Pull the text to store and the linked book ids out of a response map.

  Returns `[full books]`. `full` is the trimmed contents of the page's
  <title> (with every \"(豆瓣)\" removed) followed by the raw
  `<div id=\"info\" ...</div>` markup, if any. `books` is the set of numeric
  ids matched in the body. A nil response or a missing `:body` is treated as
  an empty page."
  ;; NOTE(review): the `[rsp]` parameter vector was missing in this copy; it is
  ;; restored from the body's use of `rsp` and the one-argument call site.
  [rsp]
  (let [body  (or (:body rsp) "")
        ;; NOTE(review): the original pattern read `(?<\d+`, which is not a
        ;; valid regex -- the lookbehind text was lost. `subject/` is an
        ;; assumption about the site's book-link format; confirm it (and the
        ;; URL prefix used by the crawler) before running.
        books (into #{} (re-seq #"(?i)(?<=subject/)\d+" body))
        title (clojure.string/replace
               (or (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" body) "")
               "(豆瓣)" "")
        ;; nil when the page has no info block; `str` below turns that into "".
        info  (re-find #"<div id=\"info\"[\s\S]*?</div>" body)
        full  (str (clojure.string/trim title) info)]
    [full books]))
(defn go-crawl
  "Blocking crawl worker. Repeatedly takes an id from `id-ch`, fetches its
  page with `get-ua`, and puts `[id info]` on `info-ch`.

  Every id found on the page that is not yet in `seen` is stored in Redis with
  a nil value and queued on `id-ch`; all found ids are then added to `seen`.
  The loop recurs unconditionally, so this never returns -- run it on its own
  thread."
  [id-ch info-ch]
  (loop [i 0]
    (let [id     (<!! id-ch)
          ;; NOTE(review): the URL prefix is an empty string in this copy of
          ;; the file (it looks stripped); fill in the site's book URL prefix.
          url    (str "" id "/")
          before (quot (. System (nanoTime)) 1000000)
          rsp    (get-ua url)
          after  (quot (. System (nanoTime)) 1000000)
          [info ids] (extract-rsp rsp)]
      ;; Only every 100th request of this worker is logged.
      (when (= 0 (mod i 100))
        (println "GET book" id "finished in" (- after before) "ms"))
      (>!! info-ch [id info])
      (doseq [-id ids]
        (when-not (@seen -id)
          ;; Mark the NEW id with nil so a restart can re-queue it, and queue
          ;; that id. The previous code wrote nil over the current `id` and
          ;; queued the loop counter `i` instead of the discovered id.
          (wcar* (car/set -id nil))
          (>!! id-ch -id)))
      ;; NOTE(review): clojure.set is not in the ns :require visible in this
      ;; file; this relies on it being loaded elsewhere.
      (swap! seen clojure.set/union ids)
      (recur (inc i)))))
(defn go-write
  "Start the Redis writer: a go-loop that takes `[id info]` pairs from
  `info-ch` and stores `info` under `id`.

  Returns the go-loop's channel, which closes once `info-ch` has been closed
  and drained."
  ;; NOTE(review): the `[info-ch]` parameter vector and the loop's `(recur)`
  ;; were missing in this copy; both are restored from the body and from the
  ;; `(go-write info-ch)` call site.
  [info-ch]
  (go-loop []
    ;; `<!` yields nil on a closed channel; stop instead of spinning on nil.
    (when-let [[id info] (<! info-ch)]
      ;; `car/keys` returns a (possibly empty) vector, which is always truthy,
      ;; so test emptiness explicitly to detect an absent key.
      (if (empty? (wcar* (car/keys id)))
        (wcar* (car/set id info))
        ;; Key already present: write only real info, so an existing entry is
        ;; never replaced by nil.
        (when info
          (wcar* (car/set id info))))
      (recur))))
(defn -main
  "Entry point. Starts a status reporter, queues every known key whose stored
  value is empty, starts 12 crawler threads, and then blocks on the single
  Redis writer."
  ;; NOTE(review): the parameter vector was missing in this copy, and the
  ;; status `loop` was left unclosed with no `recur`. `[& _args]`, the
  ;; `a/thread` wrapper and the `(recur)` are reconstructions -- without a
  ;; separate thread the reporting loop would block the steps below.
  [& _args]
  (let [bookid-ch (chan 500000)
        info-ch   (chan 10000)]
    ;; Progress report every 15 seconds.
    (a/thread
      (loop []
        (<!! (a/timeout (* 15 1000)))
        (println "seen urls" (count @seen) "book remaining" (count (.buf bookid-ch)))
        (recur)))
    ;; Bootstrap using keys with nil value: `empty?` is true for both nil and
    ;; "". The matching keys are collected first and then queued, as before.
    (doseq [k (filterv #(empty? (wcar* (car/get %))) @seen)]
      (>!! bookid-ch k))
    ;; n crawlers
    (doseq [_ (range 12)]
      (a/thread (go-crawl bookid-ch info-ch)))
    ;; single redis writer
    (<!! (go-write info-ch))))
Copy link

ahxxm commented Mar 9, 2020

导出books-remaining.csv,其中isbns为Goodreads没有的ISBN list,需要从网页上整理:

def extract_info(data):
    """Parse one stored crawl record into an ISBN and Goodreads import fields.

    ``data`` is the page title immediately followed by the
    ``<div id="info" ...</div>`` markup.

    Returns ``(isbn, [author, title, year, publisher])``, or ``(None, None)``
    when the record has no ISBN (including records with no info block at all).
    ``author`` falls back to ``"Anonymous"``; ``year`` and ``publisher`` are
    ``""`` when the page does not list them.
    """
    # Goodreads import columns, for reference:
    # Title, Author, ISBN,
    # Publisher, Binding, Year Published, Original Publication Year, Date Read, Date Added, Bookshelves, My Review

    # partition() never raises: a record without the info block simply yields
    # an empty ``remain`` and falls through to the "no ISBN" return below.
    title, _, remain = data.partition("<div id=\"info")

    # The label markup varies (colon inside or outside the span); when no
    # author is listed, the translator is used instead.
    author_patterns = (
        r"(?<=作者</span>:)[\s\S]*?</span>",
        r"(?<=作者:</span>)[\s\S]*?</span>",
        r"(?<=译者:</span>)[\s\S]*?</span>",
        r"(?<=译者</span>:)[\s\S]*?</span>",
    )
    author = "Anonymous"  # HACK
    for pattern in author_patterns:
        found = re.search(pattern, remain)
        if found:
            author = found.group(0).strip()
            break
    if "<a" in author:
        # Keep only the text of the first tag's content.
        link_text = re.search(r"(?<=>)[\s\S]*?(?=<)", author)
        if link_text:
            author = link_text.group(0).strip()

    isbn = re.search(r"(?<=ISBN:</span> )\d+", remain)
    if not isbn:
        return None, None
    isbn = isbn.group(0)

    year = re.search(r"(?<=出版年:</span>).*(?=<)", remain)
    year = year.group(0).strip() if year else ""

    # Previously the extracted publisher was unconditionally overwritten with
    # "" (the ``else:`` had been lost), so it never reached the output.
    publisher = re.search(r"出版社.*</span>.*?(?=<)", remain)
    if publisher:
        publisher = publisher.group(0).split("</span>")[1].strip()
    else:
        publisher = ""

    return isbn, [author, title, year, publisher]

# Index every crawled record by ISBN: parsed fields and the Redis key it came from.
isbn_to_info = {}
isbn_to_id = {}
keys = r.keys()  # everything the crawler stored
for k in keys:
    data = r.get(k).decode()
    # Records whose whole text is one of these placeholder titles ("page does
    # not exist" / "entry does not exist") carry no book data. The ``if`` had
    # lost its body in this copy; skipping the record is the restored behavior.
    if data == "页面不存在" or data == "条目不存在":
        continue

    isbn, row = extract_info(data)
    if isbn:
        isbn_to_info[isbn] = row
        isbn_to_id[isbn] = k

# Uses isbn_added and isbn_shelf from an earlier step (not shown in this snippet).
# Builds one row per wanted ISBN:
# [isbn, date added, shelf, author, title, year, publisher].
books = []
for isbn in isbns:
    isbn = str(isbn)
    row = isbn_to_info.get(isbn)
    # ISBNs that were never crawled have nothing to export. The ``if`` had lost
    # its body in this copy; ``continue`` is the restored behavior.
    if not row:
        continue
    added = isbn_added.get(isbn, "")
    # An earlier lookup with a "to-read" default was always overwritten by this
    # one, so only this default ever took effect.
    shelf = isbn_shelf.get(isbn, "currently-reading")
    # NOTE(review): the loop previously built the row and discarded it, leaving
    # ``books`` empty; appending is assumed to be the lost final step.
    books.append([isbn, added, shelf] + row)

# remaining dump
# newline="" is what the csv module expects for files it writes; utf-8 keeps
# non-ASCII titles and authors intact regardless of the platform default.
with open("books-remaining.csv", "w", newline="", encoding="utf-8") as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    # Header order follows the row layout built above
    # ([isbn, added, shelf] + [author, title, year, publisher]); the previous
    # header listed Publisher/Title/Year in an order that did not match it.
    wr.writerow(["ISBN", "Date Added", "Bookshelves", "Author", "Title", "Year Published", "Publisher"])
    # NOTE(review): only the header was written before; writing the collected
    # rows is assumed to be the lost final step.
    wr.writerows(books)

Copy link

ahxxm commented Mar 9, 2020


import requests
import re
import time

# All of these Goodreads cookies are required; the values are left blank here.
s = requests.Session()
s.cookies.update(
    {
        "ccsid": "",
        "u": "",
        "p": "",
        "_session_id2": "",
    }
)
s.headers.update(
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    }
)

def create_one(isbn, author, title, publisher):
    """Submit one book through a "Create book" web form using session ``s``.

    Fetches the form page to read its CSRF token, posts the book fields, and
    prints the resulting URL when ``title`` appears in the response text.

    Raises:
        IndexError: if the form page contains no csrf-token meta tag.
    """
    # NOTE(review): both request URLs are empty strings in this copy of the
    # script (they look stripped); fill in the form page and submit URLs.
    rsp = s.get("")
    token = re.search(r'(?<=csrf-token" content=").*(?="\ )', rsp.text)
    if token is None:
        # Same exception type the bare ``[0]`` lookup raised, with a message.
        raise IndexError("csrf-token meta tag not found in form page")
    csrf = token.group(0)
    pdata = {
        "utf-8": "✓",
        "authenticity_token": csrf,
        "book[title]": title,
        "book[sort_by_title]": title,
        "author[name]": author,
        "book[isbn]": isbn,
        "book[publisher]": publisher,
        "book[update_default_description]": 1,
        "work[media_type]": "book",
        "commit": "Create book",
    }
    # NOTE(review): the closing brace above and the call below were mangled in
    # this copy (``rsp ="", data=pdata)``); ``s.post`` is inferred from the
    # ``data=`` form payload.
    rsp = s.post("", data=pdata)
    if title in rsp.text:
        print(title, "created at", rsp.url)

# csv_data is a list of lists; each row has exactly seven fields, of which
# only the 1st, 4th, 5th and 7th are passed on.
for isbn, _, _, author, title, _, publisher in csv_data:
    create_one(isbn, author, title, publisher)

Copy link

aleung commented Apr 15, 2020

现在 页面需要 reCAPTCHA,是不是没法用脚本添加书籍了?

Copy link

ahxxm commented Apr 16, 2020 via email

Copy link

CAUJP commented Nov 18, 2022


Copy link

ahxxm commented Nov 18, 2022


Copy link

aleung commented Nov 21, 2022


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment