Skip to content

Instantly share code, notes, and snippets.

@Destaq
Last active April 30, 2021 19:59
Show Gist options
  • Save Destaq/bf4378951b8daa96ae5f863dfc7f6a04 to your computer and use it in GitHub Desktop.
Save Destaq/bf4378951b8daa96ae5f863dfc7f6a04 to your computer and use it in GitHub Desktop.
Create a ready-to-read/analyze dushu369.com .txt file from the TOC link
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Retry policy: up to 3 retries on connection errors, with a backoff factor
# of 0.5 between attempts.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)

# One shared session for the whole script; the same retrying adapter is
# mounted for both plain and TLS URLs.
session = requests.Session()
for _scheme in ('http://', 'https://'):
    session.mount(_scheme, adapter)
def generate_book_file(
    homepage_link: str,
    *,
    output_dir: str = "books",
    page_encoding: str = "gb2312",
) -> None:
    """
    Scrape every chapter linked from a book's table of contents into one .txt file.

    `homepage_link` is the link to the TOC of the book, e.g. http://www.dushu369.com/tonghua/bhldmm/

    `output_dir` is the folder that receives `<book title>.txt`; it is created
    if it does not exist yet.

    `page_encoding` is the codec used to decode the downloaded HTML; bytes that
    cannot be decoded with it are dropped.

    The output file is always written as UTF-8. If a page lacks the expected
    `td.cntitle` / `td.content` element, `find` returns None and the `.text`
    access raises AttributeError.
    """
    toc_page = session.get(homepage_link)
    toc_soup = BeautifulSoup(
        toc_page.content.decode(page_encoding, errors="ignore"), "html.parser"
    )
    title = toc_soup.find("td", class_="cntitle").text
    links = toc_soup.find_all("a", class_="a0")

    # The title comes straight from the page: strip surrounding whitespace and
    # neutralise path separators so it is usable as a single file name.
    safe_title = title.strip().replace("/", "_").replace("\\", "_")

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # `with` guarantees the file is closed even if a request or parse step
    # fails part-way through; the explicit encoding keeps the Chinese text
    # writable regardless of the platform's default locale encoding.
    with open(target_dir / f"{safe_title}.txt", "w", encoding="utf-8") as newfile:
        for count, link in enumerate(links, start=1):
            print(f"Scraping chapter {count} of {len(links)}", end="\r")

            # Resolve the href against the TOC URL so root-relative,
            # page-relative and absolute links all work.
            chapter_url = urljoin(homepage_link, link["href"])
            chapter_page = session.get(chapter_url)
            chapter_soup = BeautifulSoup(
                chapter_page.content.decode(page_encoding, errors="ignore"),
                "html.parser",
            )

            chapter_title = chapter_soup.find("td", class_="cntitle").text
            newfile.write(chapter_title + "\n\n")

            text = chapter_soup.find("td", class_="content").text
            newfile.write(text + "\n\n\n")
# Batch mode: reads links.txt, a file with one full dushu369 TOC link per line (e.g. collected with bs4), to download many books at once
# if __name__ == "__main__":
# file = open("links.txt", "r")
# file_lines = file.readlines()
# for i in range(len(file_lines)):
# print(f"Scraping book {i + 1} of {len(file_lines)}.")
# generate_book_file(file_lines[i][:-1])
# file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment