Skip to content

Instantly share code, notes, and snippets.

@ahxxm
Last active November 21, 2022 13:59
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
title + <div id="info" /> => local redis
(ns doubanbook-cralwer.core
  (:require [clj-http.client :as http]
            [clojure.core.async :refer [go-loop chan <! >!! <!!] :as a]
            [clojure.set]
            [clojure.string]
            [taoensso.carmine :as car :refer (wcar)])
  (:gen-class))
;; User-Agent header value sent with every HTTP request made by `get-ua`.
(def ua
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")

;; Carmine connection options: empty pool options, Redis at 127.0.0.1:6379, db 0.
(def redis-opts
  {:pool {}
   :spec {:uri "redis://127.0.0.1:6379/0"}})

;; Runs the given Carmine commands against the connection in `redis-opts`.
(defmacro wcar*
  [& forms]
  `(car/wcar redis-opts ~@forms))

;; Atom holding the set of known ids. It is seeded when this namespace loads
;; with every key currently in Redis (`KEYS *`), so loading requires Redis.
(def seen
  (atom (set (wcar* (car/keys "*")))))
(defn get-ua
  "Issues an HTTP GET for `url`, sending `ua` as the User-Agent header.

  The request is made with `:throw-exceptions false` and `:max-redirects 5`.
  Returns whatever `http/get` returns; if an Exception is thrown, its message
  is printed and an empty map is returned instead."
  [url]
  (let [request-opts {:headers {"User-Agent" ua}
                      :throw-exceptions false
                      :max-redirects 5}]
    (try
      (http/get url request-opts)
      (catch Exception ex
        (println (str "caught exception: " (.getMessage ex)))
        {}))))
(defn extract-rsp
  "Pulls the interesting parts out of an HTTP response map.

  Reads `(:body rsp)` (treated as \"\" when missing) and returns a vector
  `[full books]` where:
    * `full`  is the trimmed <title> text with the \"(豆瓣)\" suffix removed,
              followed by the first `<div id=\"info\" ...>...</div>` fragment
              (nothing is appended when no such div is found);
    * `books` is the set of numeric id strings found after
              `book.douban.com/subject/` anywhere in the body."
  [rsp]
  (let [html      (or (:body rsp) "")
        book-ids  (set (re-seq #"(?i)(?<=book.douban.com/subject/)\d+" html))
        raw-title (or (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" html) "")
        title     (-> raw-title
                      (clojure.string/replace "(豆瓣)" "")
                      clojure.string/trim)
        ;; Non-greedy match: stops at the first closing </div>.
        info-div  (re-find #"<div id=\"info\"[\s\S]*?</div>" html)]
    [(str title info-div) book-ids]))
(defn go-crawl
  "Blocking crawler loop, meant to run on its own thread.

  Repeatedly:
    1. takes a book id from `id-ch`,
    2. fetches https://book.douban.com/subject/<id>/ via `get-ua`,
    3. puts `[id info]` on `info-ch` for the Redis writer,
    4. for every book id linked from the page that is not yet in `seen`,
       stores that id in Redis with a nil value (so a restart can pick it up
       again) and queues it on `id-ch`,
    5. adds all linked ids to `seen`.

  `id-ch`   - channel of book ids still to crawl (read and written here).
  `info-ch` - channel receiving `[id info]` pairs.

  Does not return on its own; `>!!` blocks while a channel buffer is full."
  [id-ch info-ch]
  (loop [i 0]
    (let [id (<!! id-ch)
          url (str "https://book.douban.com/subject/" id "/")
          before (quot (System/nanoTime) 1000000)
          rsp (get-ua url)
          after (quot (System/nanoTime) 1000000)
          [info ids] (extract-rsp rsp)]
      ;; NOTE: a per-request delay was sketched here originally but is disabled.
      ;; Report timing for one request in every hundred to keep the log short.
      (when (zero? (mod i 100))
        (println "GET book" id "finished in" (- after before) "ms"))
      (>!! info-ch [id info])
      (doseq [-id ids]
        (when-not (@seen -id)
          ;; Record the *discovered* id (not the page we just crawled) with a
          ;; nil value, so the bootstrap in -main re-queues it after a restart.
          (wcar* (car/set -id nil))
          ;; Queue the discovered id itself, not the loop counter.
          (>!! id-ch -id)))
      (swap! seen clojure.set/union ids)
      (recur (inc i)))))
(defn go-write
  "Starts the single Redis writer and returns the channel of its go-loop.

  Takes `[id info]` pairs from `info-ch` and stores `info` under key `id`:
    * when the key does not exist yet, the value is written unconditionally;
    * when the key already exists, it is only overwritten by a non-nil `info`.

  The loop ends (and the returned channel closes) once `info-ch` is closed.

  NOTE(review): the Redis calls are blocking I/O inside a go block; with a
  single writer this only ties up one dispatch thread."
  [info-ch]
  (go-loop []
    ;; A nil take means `info-ch` was closed: stop rather than spin on nil.
    (when-some [[id info] (<! info-ch)]
      ;; EXISTS yields 1/0. The previous KEYS-based check returned a
      ;; collection, which is always truthy, so this branch could never run.
      (if (zero? (wcar* (car/exists id)))
        (wcar* (car/set id info))
        ;; Key already present: never replace it with nil.
        (when info
          (wcar* (car/set id info))))
      (recur))))
(defn -main
  "Entry point: wires the crawler together and blocks on the Redis writer.

  Steps, in order:
    1. start a reporter thread that prints progress every 15 seconds;
    2. bootstrap the id queue with every known key whose stored value is
       empty (nil or \"\"), i.e. ids recorded but not crawled yet;
    3. start 12 crawler threads running `go-crawl`;
    4. block on the channel returned by `go-write`."
  []
  (let [id-queue (chan 500000)
        info-queue (chan 10000)]
    ;; Progress reporter.
    (a/thread
      (loop []
        (<!! (a/timeout 15000))
        (println "seen urls" (count @seen) "book remaining" (count (.buf id-queue)))
        (recur)))
    ;; Bootstrap: collect all keys with an empty value first, then queue them.
    (let [pending (filterv (fn [k] (empty? (wcar* (car/get k))))
                           (vec @seen))]
      (doseq [k pending]
        (>!! id-queue k)))
    ;; Crawler pool.
    (dotimes [_ 12]
      (a/thread (go-crawl id-queue info-queue)))
    ;; Single Redis writer; block the main thread on it.
    (<!! (go-write info-queue))))
@ahxxm
Copy link
Author

ahxxm commented Mar 9, 2020

创建新书:

```python
import requests
import re
import time

# All of these Goodreads cookies are required; fill in the values taken from
# a logged-in browser session.
_GOODREADS_COOKIES = {
    "ccsid": "",
    "u": "",
    "p": "",
    "_session_id2": "",
}

s = requests.Session()
for _cookie_name, _cookie_value in _GOODREADS_COOKIES.items():
    s.cookies.set(_cookie_name, _cookie_value)
s.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
})

def create_one(isbn, author, title, publisher):
    """Create one book on Goodreads by submitting the "new book" web form.

    Uses the module-level session ``s``: first GETs the form page to read its
    CSRF token, then POSTs the book fields. When the response text contains
    ``title``, prints the title and the final response URL.

    Args:
        isbn: value sent as ``book[isbn]``.
        author: value sent as ``author[name]``.
        title: value sent as ``book[title]`` and ``book[sort_by_title]``.
        publisher: value sent as ``book[publisher]``.

    Raises:
        IndexError: if no CSRF token can be found in the form page.
    """
    rsp = s.get("https://www.goodreads.com/book/new")
    # Raw string (no invalid "\ " escape) and [^"]* instead of a greedy ".*",
    # so the capture stops at the token's own closing quote even when other
    # attributes or tags follow on the same line.
    match = re.search(r'csrf-token" content="([^"]*)"', rsp.text)
    if match is None:
        raise IndexError("csrf-token not found in the Goodreads new-book page")
    csrf = match.group(1)
    pdata = {
        "utf-8": "✓",
        "authenticity_token": csrf,
        "book[title]": title,
        "book[sort_by_title]": title,
        "author[name]": author,
        "book[isbn]": isbn,
        "book[publisher]": publisher,
        "book[update_default_description]": 1,
        "work[media_type]": "book",
        "commit": "Create book",
    }
    rsp = s.post("https://www.goodreads.com/book/new", data=pdata)
    if title in rsp.text:
        print(title, "created at", rsp.url)

# csv_data is a list of lists: one 7-field row per book. Only the ISBN,
# author, title and publisher columns are used.
for isbn, _date, _, author, title, _date_create, publisher in csv_data:
    create_one(isbn, author, title, publisher)
    # Pause one second between submissions.
    time.sleep(1)

@aleung
Copy link

aleung commented Apr 15, 2020

现在 https://www.goodreads.com/book/new 页面需要 reCAPTCHA,是不是没法用脚本添加书籍了?

@ahxxm
Copy link
Author

ahxxm commented Apr 16, 2020 via email

@githubcow
Copy link

现在没办法了么?

@ahxxm
Copy link
Author

ahxxm commented Nov 18, 2022

现在要librarian才能创建图书,不如去neodb,自动爬各处图书,维护也比较积极

@aleung
Copy link

aleung commented Nov 21, 2022

neodb真不错

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment