Skip to content

Instantly share code, notes, and snippets.

@ahxxm
Last active November 21, 2022 13:59
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save ahxxm/5b012985008708d96b22102258c1b90d to your computer and use it in GitHub Desktop.
title + <div id="info" /> => local redis
(ns doubanbook-cralwer.core
(:require [clj-http.client :as http]
[clojure.core.async :refer [go-loop chan <! >!! <!!] :as a]
[taoensso.carmine :as car :refer (wcar)])
(:gen-class))
;; Desktop Chrome User-Agent sent with every crawl request.
(def ua "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36")
;; Carmine connection options for the local redis that maps book id -> page info.
(def redis-opts {:pool {} :spec {:uri "redis://127.0.0.1:6379/0"}})
;; Shorthand for running carmine commands against redis-opts.
(defmacro wcar* [& body] `(car/wcar redis-opts ~@body))
;; All book ids already present in redis (crawled or merely queued).
;; NOTE: populated at load time with a blocking KEYS * call.
(def seen (atom (into #{} (wcar* (car/keys "*")))))
(defn get-ua
  "GET `url` with a desktop User-Agent, following up to 5 redirects.
  Returns the clj-http response map; on any exception prints the
  message and returns an empty map instead of throwing."
  [url]
  (let [opts {:headers {"User-Agent" ua}
              :throw-exceptions false
              :max-redirects 5}]
    (try
      (http/get url opts)
      (catch Exception e
        (println (str "caught exception: " (.getMessage e)))
        {}))))
(defn extract-rsp
  "Extract [page-text linked-ids] from a crawled response map.
  page-text is the trimmed <title> (with the \"(豆瓣)\" suffix removed)
  concatenated with the raw <div id=\"info\"> block; linked-ids is the
  set of book ids referenced via book.douban.com/subject/ links."
  [rsp]
  (let [body      (or (:body rsp) "")
        ids       (set (re-seq #"(?i)(?<=book.douban.com/subject/)\d+" body))
        raw-title (or (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" body) "")
        title     (clojure.string/replace raw-title "(豆瓣)" "")
        info      (re-find #"<div id=\"info\"[\s\S]*?</div>" body)]
    [(str (clojure.string/trim title) info) ids]))
(defn go-crawl
  "Crawler worker: repeatedly take a book id off `id-ch`, fetch its
  subject page, push [id info] onto `info-ch`, and enqueue any newly
  discovered book ids.  Never returns; run on a dedicated thread."
  [id-ch info-ch]
  (loop [i 0]
    (let [id (<!! id-ch)
          url (str "https://book.douban.com/subject/" id "/")
          before (quot (. System (nanoTime)) 1000000)
          rsp (get-ua url)
          after (quot (. System (nanoTime)) 1000000)
          [info ids] (extract-rsp rsp)]
      ;; log one request in every hundred to show progress/latency
      (when (= 0 (mod i 100))
        (println "GET book" id "finished in" (- after before) "ms"))
      (>!! info-ch [id info])
      (doseq [-id ids]
        (when-not (@seen -id)
          ;; new id: set nil for restart bootstrap, then enqueue it.
          ;; BUGFIX: original wrote key `id` (the page just crawled) and
          ;; enqueued the loop counter `i` instead of the new id `-id`.
          (wcar* (car/set -id nil))
          (>!! id-ch -id)))
      (swap! seen clojure.set/union ids)
      (recur (inc i)))))
(defn go-write
  "Single redis writer: consume [id info] pairs from `info-ch` and
  persist them.  A missing key is always written; an existing key is
  only overwritten when `info` is non-nil (so a crawled value is never
  clobbered back to nil)."
  [info-ch]
  (go-loop []
    (let [[id info] (<! info-ch)]
      ;; BUGFIX: (car/keys id) returns a seq, and even an EMPTY seq is
      ;; truthy in Clojure, so the original if-not could never take the
      ;; "missing key" branch.  EXISTS returns 0/1 and is also O(1).
      (if (zero? (wcar* (car/exists id)))
        (wcar* (car/set id info))
        ;; else only overwrite nil
        (when info
          (wcar* (car/set id info)))))
    (recur)))
(defn -main
[]
;; bookid-ch holds ids waiting to be crawled; info-ch carries [id info]
;; pairs from the crawlers to the single redis writer.
(let [bookid-ch (chan 500000)
info-ch (chan 10000)]
;; progress reporter: every 15s print known-id count and queue depth
;; (peeks at the channel's internal buffer via .buf)
(a/thread
(loop []
(<!! (a/timeout (* 15 1000)))
(println "seen urls" (count @seen) "book remaining" (count (.buf bookid-ch)))
(recur)))
;; bootstrap using keys with nil value
;; walk the seen set once, collecting every id whose redis value is
;; still nil/empty (discovered but never crawled), then enqueue them.
;; NOTE: local `keys`/`r` shadow clojure.core/keys and the redis alias.
(loop [keys []
s (vec @seen)]
(let [k (first s)
r (rest s)]
(if-not k
(doseq [key keys]
(>!! bookid-ch key))
(if-not (empty? (wcar* (car/get k))) ;; else check if nippy/nil or ""
(recur keys r)
(recur (conj keys k) r)))))
;; n crawler
;; 12 crawler threads share the same pair of channels
(doseq [_ (range 12)]
(a/thread (go-crawl bookid-ch info-ch)))
;; single redis writer
;; block the main thread forever on the writer's go-loop channel
(<!! (go-write info-ch))))
@ahxxm
Copy link
Author

ahxxm commented Mar 9, 2020

第一个csv

import json
import csv
import redis
import re

# Module-level connection to the local redis instance the crawler populated.
r = redis.Redis(host='localhost', port=6379, db=0)

def link_to_isbn(link: str) -> str:
    """Map a douban book URL to its ISBN via the crawled redis cache.

    Takes the first run of digits in *link* as the book id, looks up the
    crawled page text in redis, and pulls the ISBN out of it.  Returns
    "" when the id was never crawled or the page carries no ISBN.
    """
    # raw strings: "\d" is an invalid escape in a plain string literal
    i = re.findall(r"\d+", link)[0]
    msg = r.get(i)
    if not msg:
        return ""
    msg = msg.decode()
    isbn = re.findall(r"(?<=ISBN:</span> )\d+", msg)
    if isbn:
        return isbn[0]
    return ""

# Read the Goodreads export with csv.reader instead of str.split(",") so
# quoted fields containing commas (reviews, titles) parse correctly.
with open("books.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (was c[1:])
    rows = list(reader)

# row layout: url (replaced by isbn), date added, shelf, review
isbn_added = dict()
isbn_shelf = dict()
result = []
for splits in rows:
    isbn = link_to_isbn(splits[0])
    if isbn:
        splits[0] = isbn
        isbn_added[isbn] = splits[1]
        isbn_shelf[isbn] = splits[2]
    result.append(splits)

# initial dump; newline="" per the csv module docs, otherwise every row is
# followed by a blank line on Windows
with open("books-isbn.csv", "w", newline="") as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(["ISBN", "Date Added", "Bookshelves", "My Review"])
    wr.writerows(result)

@ahxxm
Copy link
Author

ahxxm commented Mar 9, 2020

导出books-remaining.csv,其中isbns为Goodreads没有的ISBN list,需要从网页上整理:

def extract_info(data):
    """Parse a crawled douban blob into (isbn, [author, title, year, publisher]).

    *data* is the string the crawler stored: the page title followed by
    the raw <div id="info"> block.  Returns (None, None) when the page
    carries no ISBN.  Target CSV columns: Title, Author, ISBN, Publisher,
    Binding, Year Published, Original Publication Year, Date Read,
    Date Added, Bookshelves, My Review.
    """
    # Everything before the info div is the title; everything after is markup.
    title, remain = data.split("<div id=\"info")
    # The author appears in several markup variants; try each in turn.
    # Raw strings avoid invalid "\s" escapes in plain string literals.
    author = re.findall(r"(?<=作者</span>:)[\s\S]*?</span>", remain)
    if not author:
        author = re.findall(r"(?<=作者:</span>)[\s\S]*?</span>", remain)
    if not author:
        author = re.findall(r"(?<=译者:</span>)[\s\S]*?</span>", remain)
    if not author:
        author = re.findall(r"(?<=译者</span>:)[\s\S]*?</span>", remain)
    if not author:
        author = ["Anonymous"]  # HACK: no author/translator tag at all

    author = author[0].strip()
    if "<a" in author:
        # Keep only the link text, dropping the surrounding anchor tag.
        author = re.findall(r"(?<=>)[\s\S]*?(?=<)", author)[0].strip()

    isbn = re.findall(r"(?<=ISBN:</span> )\d+", remain)
    if not isbn:
        return None, None
    isbn = isbn[0]

    year = re.findall(r"(?<=出版年:</span>).*(?=<)", remain)
    if year:
        year = year[0].strip()
    else:
        year = ""  # was left as [] originally; "" is consistent with publisher

    publisher = re.findall(r"出版社.*</span>.*?(?=<)", remain)
    if publisher:
        publisher = publisher[0].split("</span>")[1].strip()
    else:
        publisher = ""

    return isbn, [author, title, year, publisher]

# Build ISBN -> parsed row and ISBN -> redis key maps from every crawled page.
isbn_to_info = dict()
isbn_to_id = dict()
keys = r.keys()  # every key the crawler stored
for k in keys:
    data = r.get(k).decode()
    # douban's "page does not exist" / "entry does not exist" bodies
    if data == "页面不存在" or data == "条目不存在":
        continue

    isbn, row = extract_info(data)
    if isbn:
        isbn_to_info[isbn] = row
        isbn_to_id[isbn] = k

# Uses isbn_added and isbn_shelf built by the first script above; `isbns`
# is the hand-collected list of ISBNs Goodreads is missing.
books = []
for isbn in isbns:
    isbn = str(isbn)
    row = isbn_to_info.get(isbn)
    if not row:  # never crawled, or the page had no ISBN
        continue

    added = isbn_added.get(isbn, "")
    # NOTE(review): the original assigned shelf twice with different
    # defaults ("to-read" then "currently-reading"); only the second ever
    # took effect, so that one is kept and the dead assignment dropped.
    shelf = isbn_shelf.get(isbn, "currently-reading")
    # row is [author, title, year, publisher]
    books.append([isbn, added, shelf] + row)

# remaining dump; newline="" per the csv module docs (avoids blank lines
# on Windows).  BUGFIX: the header now matches the actual column order of
# the rows ([isbn, added, shelf, author, title, year, publisher]); the
# original header mislabeled the last four columns.
with open("books-remaining.csv", "w", newline="") as f:
    wr = csv.writer(f, quoting=csv.QUOTE_ALL)
    wr.writerow(["ISBN", "Date Added", "Bookshelves", "Author", "Title", "Year Published", "Publisher"])
    wr.writerows(books)

@ahxxm
Copy link
Author

ahxxm commented Mar 9, 2020

创建新书:

```python
import requests
import re
import time

# All of these cookies from a logged-in Goodreads browser session are
# required; fill the values in by hand before running.
s = requests.Session()
s.cookies.set("ccsid", "")
s.cookies.set("u", "")
s.cookies.set("p", "")
s.cookies.set("_session_id2", "")
s.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"

def create_one(isbn, author, title, publisher):
    """Create one book on Goodreads via the manual /book/new form.

    Fetches the form to pull the CSRF token out of the page <meta> tag,
    then posts the book fields with it.  Prints the resulting URL when
    the response echoes the title back (weak success check).
    """
    rsp = s.get("https://www.goodreads.com/book/new")
    # raw string: "\ " is an invalid escape in a plain string literal
    csrf = re.findall(r'(?<=csrf-token" content=").*(?="\ )', rsp.text)[0]
    pdata = {
        "utf-8": "✓",
        "authenticity_token": csrf,
        "book[title]": title,
        "book[sort_by_title]": title,
        "author[name]": author,
        "book[isbn]": isbn,
        "book[publisher]": publisher,
        "book[update_default_description]": 1,
        "work[media_type]": "book",
        "commit": "Create book",
    }
    rsp = s.post("https://www.goodreads.com/book/new", data=pdata)
    if title in rsp.text:
        print(title, "created at", rsp.url)

# csv_data is a list of lists: the rows written to books-remaining.csv above
# ([isbn, date added, shelf, author, title, year, publisher]).
for d in csv_data:
    isbn, date, _, author, title, date_create, publisher = d
    create_one(isbn, author, title, publisher)
    time.sleep(1)  # be gentle: at most one creation per second

@aleung
Copy link

aleung commented Apr 15, 2020

现在 https://www.goodreads.com/book/new 页面需要 reCAPTCHA,是不是没法用脚本添加书籍了?

@ahxxm
Copy link
Author

ahxxm commented Apr 16, 2020 via email

@githubcow
Copy link

现在没办法了么?

@ahxxm
Copy link
Author

ahxxm commented Nov 18, 2022

现在要librarian才能创建图书,不如去neodb,自动爬各处图书,维护也比较积极

@aleung
Copy link

aleung commented Nov 21, 2022

neodb真不错

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment