Last active
April 30, 2021 19:59
-
-
Save Destaq/bf4378951b8daa96ae5f863dfc7f6a04 to your computer and use it in GitHub Desktop.
Create a ready-to-read/analyze dushu369.com .txt file from the TOC link
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
from requests.adapters import HTTPAdapter | |
from requests.packages.urllib3.util.retry import Retry | |
# Shared HTTP session that retries flaky connections instead of failing outright.
session = requests.Session()

_retry_policy = Retry(connect=3, backoff_factor=0.5)
_retry_adapter = HTTPAdapter(max_retries=_retry_policy)
for _scheme in ("http://", "https://"):
    session.mount(_scheme, _retry_adapter)
def generate_book_file(homepage_link):
    """Scrape every chapter of a dushu369.com book into one .txt file.

    `homepage_link` is the link to the TOC of the book,
    e.g. http://www.dushu369.com/tonghua/bhldmm/

    Output goes to books/<title>.txt — NOTE: change for specific write
    location, must first create `books` folder. Pages are served as
    GB2312; undecodable bytes are dropped rather than raising.
    """
    page = session.get(homepage_link)
    soup = BeautifulSoup(page.content.decode("gb2312", errors="ignore"), "html.parser")

    title = soup.find("td", class_="cntitle").text
    links = soup.find_all("a", class_="a0")

    # `with` guarantees the file is closed (and partial progress flushed)
    # even if a request or parse fails mid-book; explicit UTF-8 avoids a
    # locale-dependent default encoding that may not handle CJK text.
    with open(f"books/{title}.txt", "w", encoding="utf-8") as newfile:
        for count, link in enumerate(links, start=1):
            print(f"Scraping chapter {count} of {len(links)}", end="\r")

            chapter_page = session.get("http://www.dushu369.com" + link["href"])
            soup = BeautifulSoup(chapter_page.content.decode("gb2312", errors="ignore"), "html.parser")

            chapter_title = soup.find("td", class_="cntitle").text
            newfile.write(chapter_title + "\n\n")

            text = soup.find("td", class_="content").text
            newfile.write(text + "\n\n\n")
# line-separated file (e.g. generated through bs4) of full dushu links for many downloads at once
# if __name__ == "__main__":
#     file = open("links.txt", "r")
#     file_lines = file.readlines()
#     for i in range(len(file_lines)):
#         print(f"Scraping book {i + 1} of {len(file_lines)}.")
#         generate_book_file(file_lines[i][:-1])
#     file.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment