Skip to content

Instantly share code, notes, and snippets.

Last active November 21, 2022 13:59
Show Gist options
  • Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
title + <div id="info" /> => local redis
;; Namespace for the book crawler.
;; NOTE(review): "cralwer" looks like a typo for "crawler"; it is kept as-is
;; because the namespace name has to match the file path and any callers.
(ns doubanbook-cralwer.core
  (:require [clj-http.client :as http]
            [clojure.core.async :refer [go-loop chan <! >!! <!!] :as a]
            ;; clojure.set and clojure.string are used by fully-qualified name in
            ;; this namespace, so they must be loaded explicitly.
            [clojure.set]
            [clojure.string]
            [taoensso.carmine :as car :refer (wcar)]))
;; Browser-like User-Agent string sent with page requests.
(def ua
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")

;; Carmine connection options (pool settings and server spec).
(def redis-opts
  {:pool {}
   :spec {:uri "redis://"}})

;; Runs the Carmine commands in `body` against the connection in `redis-opts`.
(defmacro wcar*
  [& body]
  `(car/wcar redis-opts ~@body))

;; Atom holding the set of known keys, seeded at load time with every key
;; currently present in Redis.
(def seen
  (atom (set (wcar* (car/keys "*")))))
(defn get-ua
  "Fetches `url` with an HTTP GET that sends the `ua` User-Agent header.

  Error statuses do not throw (`:throw-exceptions false`) and at most 5
  redirects are followed. Returns the clj-http response map, or nil when the
  request itself throws (the exception message is printed), so callers must
  tolerate a nil response."
  [url]
  (try
    (http/get url {:headers {"User-Agent" ua} :throw-exceptions false :max-redirects 5})
    (catch Exception e
      ;; println returns nil, which becomes the result on failure.
      (println (str "caught exception: " (.getMessage e))))))
(defn extract-rsp
  "Extracts the stored text and the linked book ids from a page response.

  `rsp` is a response map with an optional `:body` string; a nil response or
  missing body is treated as an empty page. Returns `[full books]` where `full`
  is the trimmed <title> text (with \"(豆瓣)\" removed) followed by the raw
  `<div id=\"info\">` markup, and `books` is the set of numeric id strings
  found in subject links on the page."
  [rsp]
  (let [body (or (:body rsp) "")
        ;; Lookbehind on host + path only, so http and https links both match.
        ;; NOTE(review): the lookbehind prefix was missing from this pattern and
        ;; it did not compile; the Douban subject-URL prefix is presumed.
        books (into #{} (re-seq #"(?i)(?<=book\.douban\.com/subject/)\d+" body))
        title (clojure.string/replace
               (or (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" body) "")
               "(豆瓣)" "")
        ;; re-find returns nil when the page has no info div; str renders nil as "".
        info (re-find #"<div id=\"info\"[\s\S]*?</div>" body)
        full (str (clojure.string/trim title) info)]
    [full books]))
(defn go-crawl
  "Crawler worker loop; it never returns, so run it on its own thread.

  Repeatedly takes a book id from `id-ch`, fetches its page, puts `[id info]`
  on `info-ch`, and enqueues every linked id that is not yet in `seen` back
  onto `id-ch`. Each newly discovered id is first written to Redis with a nil
  value so that a restarted run can find and re-queue it."
  [id-ch info-ch]
  (loop [i 0]
    (let [id (<!! id-ch)
          ;; NOTE(review): the base URL literal was empty, which made every
          ;; request fail; the Douban subject-page prefix is presumed.
          url (str "https://book.douban.com/subject/" id "/")
          before (quot (System/nanoTime) 1000000)
          rsp (get-ua url)
          after (quot (System/nanoTime) 1000000)
          [info ids] (extract-rsp rsp)]
      ;;(<! (a/timeout 600))
      ;; Log timing for every 100th fetch only, to keep output readable.
      (when (zero? (mod i 100))
        (println "GET book" id "finished in" (- after before) "ms"))
      (>!! info-ch [id info])
      (doseq [-id ids]
        (when-not (@seen -id)
          ;; new id set nil for restart bootstrap
          ;; The placeholder and the queue entry must both use the newly found
          ;; -id, not the id being processed or the loop counter.
          (wcar* (car/set -id nil))
          (>!! id-ch -id)))
      (swap! seen clojure.set/union ids)
      (recur (inc i)))))
(defn go-write
  "Starts the Redis writer and returns the channel of its go-loop.

  Takes `[id info]` pairs from `info-ch` until that channel is closed. A key
  that does not exist yet is always written; an existing key is only
  overwritten when `info` is non-nil."
  [info-ch]
  (go-loop []
    ;; A closed channel yields nil, which ends the loop instead of spinning.
    (when-let [[id info] (<! info-ch)]
      ;; EXISTS replies 0 or 1. (KEYS replies with a vector, which is truthy
      ;; even when empty, so it cannot serve as the condition directly.)
      (if (zero? (wcar* (car/exists id)))
        (wcar* (car/set id info))
        ;; else only overwrite nil
        (when info
          (wcar* (car/set id info))))
      (recur))))
(defn -main
  "Entry point: starts a progress reporter, re-queues unfinished ids, launches
  the crawler threads, and then blocks on the Redis writer.

  Command-line `args` are accepted and ignored."
  [& args]
  (let [bookid-ch (chan 500000)
        info-ch (chan 10000)]
    ;; Progress reporter on its own thread, printing every 15 seconds; run
    ;; inline it would block forever and nothing below would start.
    (a/thread
      (loop []
        (<!! (a/timeout (* 15 1000)))
        (println "seen urls" (count @seen) "book remaining" (count (.buf bookid-ch)))
        (recur)))
    ;; bootstrap using keys with nil value
    (doseq [k @seen
            :when (empty? (wcar* (car/get k)))] ;; nippy/nil or ""
      (>!! bookid-ch k))
    ;; n crawler
    (dotimes [_ 12]
      (a/thread (go-crawl bookid-ch info-ch)))
    ;; single redis writer
    (<!! (go-write info-ch))))
Copy link

ahxxm commented Mar 9, 2020


import requests
import re
import time

# All of the Goodreads cookies listed here are required.
s = requests.Session()
for cookie_name, cookie_value in (
    ("ccsid", ""),
    ("u", ""),
    ("p", ""),
    ("_session_id2", ""),
):
    s.cookies.set(cookie_name, cookie_value)
# Send a desktop-browser User-Agent with every request made through the session.
s.headers.update(
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}
)

def create_one(isbn, author, title, publisher):
    """Submit one book through the new-book form using the shared session ``s``.

    Fetches the form page to read its CSRF token, posts the book fields, and
    prints the resulting URL when the response page contains ``title``.

    Args:
        isbn: value for the ``book[isbn]`` field.
        author: value for the ``author[name]`` field.
        title: value for the title and sort-by-title fields.
        publisher: value for the ``book[publisher]`` field.

    Raises:
        IndexError: if no csrf-token value is found in the form page.
    """
    # NOTE(review): both URL literals below are empty in this copy of the
    # script; presumably the Goodreads new-book form page and its submit
    # endpoint. Fill them in before use.
    rsp = s.get("")
    # Raw string avoids an invalid escape sequence; [^"]* stops at the closing
    # quote of the content attribute instead of running to the end of the line.
    csrf = re.findall(r'(?<=csrf-token" content=")[^"]*', rsp.text)[0]
    pdata = {
        "utf-8": "✓",
        "authenticity_token": csrf,
        "book[title]": title,
        "book[sort_by_title]": title,
        "author[name]": author,
        "book[isbn]": isbn,
        "book[publisher]": publisher,
        "book[update_default_description]": 1,
        "work[media_type]": "book",
        "commit": "Create book",
    }
    rsp = s.post("", data=pdata)
    # The title appearing in the response page is taken as the success signal.
    if title in rsp.text:
        print(title, "created at", rsp.url)

# csv_data is a list of lists; each row is unpacked into exactly seven columns,
# of which only isbn, author, title and publisher are used.
for row in csv_data:
    isbn, _date, _, author, title, _date_create, publisher = row
    create_one(isbn, author, title, publisher)

Copy link

aleung commented Apr 15, 2020

现在添加书籍的页面需要 reCAPTCHA，是不是没法用脚本添加书籍了？

Copy link

ahxxm commented Apr 16, 2020 via email

Copy link

CAUJP commented Nov 18, 2022


Copy link

ahxxm commented Nov 18, 2022


Copy link

aleung commented Nov 21, 2022


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment