# 天眼看小说爬虫 (novel scraper for novel.zhwenpg.com)
import copy
import json
import logging
import time

import requests
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh = logging.FileHandler('tianyan.log')
fh.setLevel(logging.DEBUG)
# create console handler, also at debug level
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)
logger.info('Logging to tianyan.log')
def my_request(url, retry_time=5):
    # GET the url with retries; return the body text, or False after repeated failures.
    i = 0
    while i < retry_time:
        try:
            r = requests.get(url, timeout=30)
            if r.status_code == 200:
                return r.text
            else:
                raise Exception("HTTP status " + str(r.status_code))
        except Exception:
            logger.info("Request failed, will retry later")
            i = i + 1
            time.sleep(30)
            if i == retry_time - 1:
                logger.warning("Multiple attempts failed, sleeping 1024 seconds")
                time.sleep(1024)
    logger.error("Request Error, url: " + url)
    return False
url_base = "https://novel.zhwenpg.com/"
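# Usage sketch: my_request returns the HTML body on success, or False once the
# retries are exhausted, so callers should check for False before parsing:
#
#     html = my_request(url_base)
#     if html is not False:
#         soup = BeautifulSoup(html, 'lxml')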
"""
import git
import os, os.path
g = git.Git(os.path.expanduser("~/git/GitPython"))
result = g.execute(["git", "commit", "-m", "'message'"])
"""
class Book:
    # book_info example: {"name": "逍遥小散仙", "author": "迷男", "cover": "image/cover/kun6m7.jpg",
    #                     "book_url": "b.php?id=kun6m7", "TOC": [["楔子", "r.php?id=35482"]]}
    def __init__(self, book_info):
        self.book_info = book_info
        self.name = book_info["name"]
        self.author = book_info["author"]
        self.book_url = url_base + book_info["book_url"]
        self.local_toc = self.get_local_toc()
        if not self.local_toc["TOC"]:
            logger.info("Local TOC is empty, creating it from the remote book info")
            self.save_local_toc()
    # Track the most recently downloaded chapter; each chapter is stored as
    # [chapter name, url] so chapters that share a name stay distinguishable.
    # Download progress (book_info plus the completed TOC) is saved as JSON.
    def get_local_toc(self):
        logger.info("Loading local TOC of book " + self.name + "+" + self.author)
        try:
            with open('./novel/' + self.name + '+' + self.author + '.json', 'r', encoding='utf-8') as f_toc:
                data = json.load(f_toc)
        except Exception:
            data = dict()
        if data:
            return data
        else:
            data = copy.deepcopy(self.book_info)
            data["TOC"] = list()
            logger.info("Local TOC doesn't exist, will create it")
            return data
    def save_local_toc(self):
        with open('./novel/' + self.name + '+' + self.author + '.json', 'w', encoding='utf-8') as f_toc:
            json.dump(self.local_toc, f_toc, ensure_ascii=False)
    # Rename the file to name+author (not implemented yet).
    def change_name(self):
        pass
    # Compare the local TOC with the remote one and return the chapters still
    # to download. This only resumes after the last local chapter; it does not
    # look for gaps earlier in the list.
    def compare_with_remote(self):
        logger.info("Generating diff TOC of book: " + self.name + "+" + self.author)
        if not self.local_toc["TOC"]:
            logger.info("Nothing downloaded locally, starting from the beginning")
            return self.book_info["TOC"]
        index = -1  # if the last local chapter is missing remotely, redownload everything
        latest_chapter = self.local_toc["TOC"][-1]
        if latest_chapter in self.book_info["TOC"]:
            index = self.book_info["TOC"].index(latest_chapter)
        result = self.book_info["TOC"][index + 1:]
        if not result:
            logger.info("No update in book: " + self.name + "+" + self.author)
        return result  # list of [chapter name, url] pairs
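    # Example with hypothetical data: if the local TOC ends at
    # ["第一章", "r.php?id=2"] and the remote TOC is
    # [["楔子", "r.php?id=1"], ["第一章", "r.php?id=2"], ["第二章", "r.php?id=3"]],
    # compare_with_remote returns [["第二章", "r.php?id=3"]].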
    def download(self):
        f_book = open('./novel/' + self.name + '+' + self.author + '.txt', 'a', encoding='utf-8')
        diff_toc = self.compare_with_remote()
        for chapter in diff_toc:
            logger.info("Downloading: " + self.name + ": " + chapter[0])
            biaoti = '[color=#FF0000][b]' + chapter[0] + '[/b][/color]\n'  # BBCode chapter heading
            url = url_base + chapter[1]
            data = my_request(url)
            if data is False:
                time.sleep(1024)
                data = my_request(url)
            if data is False:
                logger.error("Giving up on chapter: " + chapter[0])
                continue
            soup = BeautifulSoup(data, 'lxml')
            text = soup.select('#tdcontent > span > p')
            story = list()
            story.append(biaoti)
            for i in text:
                story.append(i.text.replace(u'\u3000', u''))  # strip full-width spaces
            story = '\n'.join(story) + '\n'
            f_book.write(story)
            self.local_toc["TOC"].append(chapter)
            self.save_local_toc()  # checkpoint progress after every chapter
        f_book.close()
        logger.info("Download finished: " + self.name + '+' + self.author)
# Book-list generator: yields one page of books at a time as [[title, url suffix], ...].
def booklists():
    data = my_request(url_base)
    soup = BeautifulSoup(data, 'lxml')
    # the last pagination link carries the highest page number
    max_page = int(soup.select(".pageidx > a")[-1]["href"].split('=')[-1])
    page_base = url_base + "?page="
    for page_num in range(1, max_page + 1):
        book_list = list()
        logger.info("Fetching book list page " + str(page_num))
        url = page_base + str(page_num)
        data = my_request(url)
        soup = BeautifulSoup(data, 'lxml')
        for link in soup.select("table > tr > td > div > a"):
            logger.info("Adding book " + link.div.text)
            book_list.append([link.div.text, link['href']])  # [title, url]
        yield book_list
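# Because booklists() is a generator, pages are fetched lazily:
#
#     for page in booklists():
#         for title, href in page:
#             ...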
def push_remote():
    try:
        from git import Repo  # GitPython
        import os
        path = os.getcwd()
        repo = Repo(path)
        remote = repo.remote()
        repo.git.add(all=True)
        repo.index.commit("Update Latest Novel")
        logger.info("Start pushing new commit to GitHub")
        remote.push()
        logger.info("Pushed new novel to GitHub successfully")
    except Exception:
        logger.error("Error: failed to push to GitHub.")
# Fetch a book's detail page and return its book_info dict.
def get_bookinfo(book):  # book is [name, url]
    book_info = dict()
    name = book[0]
    url = book[1]
    author = "无名氏"  # fallback author ("anonymous") when none is listed
    cover = ""
    logger.info("Start downloading book: " + name)
    data = my_request(url_base + url)
    soup = BeautifulSoup(data, 'lxml')
    if soup.select(".fontwt"):
        author = soup.select(".fontwt")[0].text
    if soup.select(".ccover3"):
        cover = soup.select(".ccover3")[0]["data-src"]
    content = list()
    for link in soup.select("#dulist > li > a[href]"):
        content.append([link.text, link['href']])
    for i in soup.select("#revbtn"):  # reverse the TOC when the toggle reads "正序" (chronological order)
        if i.text == "正序":
            content.reverse()
    book_info["name"] = name
    book_info["author"] = author
    book_info["cover"] = cover
    book_info["book_url"] = url
    book_info["TOC"] = content
    return book_info
def download_book(book_info):
    book = Book(book_info)
    book.download()
if __name__ == "__main__":
while True:
for booklist in booklists():
for book in booklist:
book_info = get_bookinfo(book)
download_book(book_info)
push_remote()
logger.info("END OF JOB,SLEEP A DAY")
time.sleep(3600*24)