title + <div id="info" /> => local redis
(ns doubanbook-cralwer.core | |
(:require [clj-http.client :as http] | |
[clojure.core.async :refer [go-loop chan <! >!! <!!] :as a] | |
[taoensso.carmine :as car :refer (wcar)]) | |
(:gen-class)) | |
(def ua "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36") | |
(def redis-opts {:pool {} :spec {:uri "redis://127.0.0.1:6379/0"}}) | |
(defmacro wcar* [& body] `(car/wcar redis-opts ~@body)) | |
(def seen (atom (into #{} (wcar* (car/keys "*"))))) | |
(defn get-ua | |
[url] | |
(try | |
(http/get url {:headers {"User-Agent" ua} :throw-exceptions false :max-redirects 5}) | |
(catch Exception e | |
(println (str "caught exception: " (.getMessage e))) | |
{}))) | |
(defn extract-rsp | |
[rsp] | |
(let [body (or (:body rsp) "") | |
books (into #{} (re-seq #"(?i)(?<=book.douban.com/subject/)\d+" body)) | |
title (clojure.string/replace | |
(or (re-find #"(?<=title\>)[\s\S]*?(?=</title>)" body) "") | |
"(豆瓣)" "") | |
info (re-find #"<div id=\"info\"[\s\S]*?</div>" body) | |
full (str (clojure.string/trim title) info)] | |
[full books])) | |
(defn go-crawl | |
[id-ch info-ch] | |
(loop [i 0] | |
(let [id (<!! id-ch) | |
url (str "https://book.douban.com/subject/" id "/") | |
before (quot (. System (nanoTime)) 1000000) | |
rsp (get-ua url) | |
after (quot (. System (nanoTime)) 1000000) | |
[info ids] (extract-rsp rsp)] | |
;;(<! (a/timeout 600)) | |
(when (= 0 (mod i 100)) | |
(println "GET book" id "finished in" (- after before) "ms")) | |
(>!! info-ch [id info]) | |
(doseq [-id ids] | |
(when-not (@seen -id) | |
;; new id set nil for restart bootstrap | |
(wcar* (car/set id nil)) | |
(>!! id-ch i))) | |
(swap! seen clojure.set/union ids) | |
(recur (inc i))))) | |
(defn go-write | |
[info-ch] | |
(go-loop [] | |
(let [[id info] (<! info-ch)] | |
(if-not (wcar* (car/keys id)) | |
(wcar* (car/set id info)) | |
;; else only overwrite nil | |
(when info | |
(wcar* (car/set id info))))) | |
(recur))) | |
(defn -main | |
[] | |
(let [bookid-ch (chan 500000) | |
info-ch (chan 10000)] | |
(a/thread | |
(loop [] | |
(<!! (a/timeout (* 15 1000))) | |
(println "seen urls" (count @seen) "book remaining" (count (.buf bookid-ch))) | |
(recur))) | |
;; bootstrap using keys with nil value | |
(loop [keys [] | |
s (vec @seen)] | |
(let [k (first s) | |
r (rest s)] | |
(if-not k | |
(doseq [key keys] | |
(>!! bookid-ch key)) | |
(if-not (empty? (wcar* (car/get k))) ;; else check if nippy/nil or "" | |
(recur keys r) | |
(recur (conj keys k) r))))) | |
;; n crawler | |
(doseq [_ (range 12)] | |
(a/thread (go-crawl bookid-ch info-ch))) | |
;; single redis writer | |
(<!! (go-write info-ch)))) |
This comment has been minimized.
This comment has been minimized.
导出 def extract_info(data):
# Title, Author, ISBN,
# Publisher, Binding, Year Published, Original Publication Year, Date Read, Date Added, Bookshelves, My Review
title, remain = data.split("<div id=\"info")
author = re.findall("(?<=作者</span>:)[\s\S]*?</span>", remain)
if not author:
author = re.findall("(?<=作者:</span>)[\s\S]*?</span>", remain)
if not author:
author = re.findall("(?<=译者:</span>)[\s\S]*?</span>", remain)
if not author:
author = re.findall("(?<=译者</span>:)[\s\S]*?</span>", remain)
if not author:
author = ["Anonymous"] #HACK
author = author[0].strip()
if "<a" in author:
author = re.findall("(?<=>)[\s\S]*?(?=<)", author)[0].strip()
isbn = re.findall("(?<=ISBN:</span> )\d+", remain)
if not isbn:
return None, None
isbn = isbn[0]
year = re.findall("(?<=出版年:</span>).*(?=<)", remain)
if year:
year = year[0].strip()
publisher = re.findall("出版社.*</span>.*?(?=<)", remain)
if publisher:
publisher = publisher[0].split("</span>")[1].strip()
else:
publisher = ""
return isbn, [author, title, year, publisher]
isbn_to_info = dict()
isbn_to_id = dict()
keys = r.keys() # 所有爬下来的
for k in keys:
data = r.get(k).decode()
if data == "页面不存在" or data == "条目不存在":
continue
isbn, row = extract_info(data)
if isbn:
isbn_to_info[isbn] = row
isbn_to_id[isbn] = k
# 用到了上面的 isbn_added 和 isbn_shelf
books = []
for isbn in isbns:
isbn = str(isbn)
shelf = isbn_shelf.get(isbn, "to-read")
row = isbn_to_info.get(isbn)
if not row:
continue
added = isbn_added.get(isbn, "")
shelf = isbn_shelf.get(isbn, "currently-reading")
author, title, date, publisher = row
row = [isbn, added, shelf]+row
books.append(row)
# remaining dump
with open("books-remaining.csv", 'w') as f:
wr = csv.writer(f, quoting=csv.QUOTE_ALL)
wr.writerow(["ISBN", "Date Added", "Bookshelves", "Author", "Publisher", "Title", "Year Published"])
wr.writerows(books)``` |
This comment has been minimized.
This comment has been minimized.
创建新书: ```python
import requests
import re
import time
# Goodreads cookies里的这几个都需要
s = requests.Session()
s.cookies.set("ccsid", "")
s.cookies.set("u", "")
s.cookies.set("p", "")
s.cookies.set("_session_id2", "")
s.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
def create_one(isbn, author, title, publisher):
rsp = s.get("https://www.goodreads.com/book/new")
csrf = re.findall('(?<=csrf-token" content=").*(?="\ )', rsp.text)[0]
pdata = {
"utf-8": "✓",
"authenticity_token": csrf,
"book[title]": title,
"book[sort_by_title]": title,
"author[name]": author,
"book[isbn]": isbn,
"book[publisher]": publisher,
"book[update_default_description]": 1,
"work[media_type]": "book",
"commit": "Create book",
}
rsp = s.post("https://www.goodreads.com/book/new", data=pdata)
if title in rsp.text:
print(title, "created at", rsp.url)
# csv_data是个list of list
for d in csv_data:
isbn, date, _, author, title, date_create, publisher = d
create_one(isbn, author, title, publisher)
time.sleep(1) |
This comment has been minimized.
This comment has been minimized.
现在 https://www.goodreads.com/book/new 页面需要 reCAPTCHA,是不是没法用脚本添加书籍了? |
This comment has been minimized.
This comment has been minimized.
大概是吧…… 我用的时候还没有recaptcha
…On Wed, Apr 15, 2020, 23:56 Leo Liang ***@***.***> wrote:
***@***.**** commented on this gist.
------------------------------
现在 https://www.goodreads.com/book/new 页面需要 reCAPTCHA,是不是没法用脚本添加书籍了?
—
You are receiving this because you authored the thread.
Reply to this email directly, view it on GitHub
<https://gist.github.com/5b012985008708d96b22102258c1b90d#gistcomment-3255641>,
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AAJ2AUOATWY72M5E4F7MSPTRMXKKTANCNFSM4LD77JYA>
.
|
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
第一个csv