Last active
April 30, 2021 19:59
-
-
Save Destaq/bf4378951b8daa96ae5f863dfc7f6a04 to your computer and use it in GitHub Desktop.
Create a ready-to-read/analyze dushu369.com .txt file from the TOC link
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
from requests.adapters import HTTPAdapter | |
from requests.packages.urllib3.util.retry import Retry | |
# Shared HTTP session that retries flaky connections instead of failing outright.
session = requests.Session()

_retry_policy = Retry(connect=3, backoff_factor=0.5)
_retry_adapter = HTTPAdapter(max_retries=_retry_policy)
for _scheme in ("http://", "https://"):
    session.mount(_scheme, _retry_adapter)
def generate_book_file(homepage_link):
    """Scrape every chapter of a dushu369.com book into one .txt file.

    `homepage_link` is the link to the TOC of the book,
    e.g. http://www.dushu369.com/tonghua/bhldmm/

    Output goes to books/<title>.txt — NOTE: change for specific write
    location, must first create `books` folder. Pages are served as
    GB2312; undecodable bytes are dropped rather than raising.
    """
    page = session.get(homepage_link)
    soup = BeautifulSoup(page.content.decode("gb2312", errors="ignore"), "html.parser")

    title = soup.find("td", class_="cntitle").text
    links = soup.find_all("a", class_="a0")

    # `with` guarantees the file is closed (and partial progress flushed)
    # even if a request or parse fails mid-book; explicit UTF-8 avoids a
    # locale-dependent default encoding that may not handle CJK text.
    with open(f"books/{title}.txt", "w", encoding="utf-8") as newfile:
        for count, link in enumerate(links, start=1):
            print(f"Scraping chapter {count} of {len(links)}", end="\r")

            chapter_page = session.get("http://www.dushu369.com" + link["href"])
            soup = BeautifulSoup(chapter_page.content.decode("gb2312", errors="ignore"), "html.parser")

            chapter_title = soup.find("td", class_="cntitle").text
            newfile.write(chapter_title + "\n\n")

            text = soup.find("td", class_="content").text
            newfile.write(text + "\n\n\n")
# line-separated file (e.g. generated through bs4) of full dushu links for many downloads at once
# if __name__ == "__main__":
#     file = open("links.txt", "r")
#     file_lines = file.readlines()
#     for i in range(len(file_lines)):
#         print(f"Scraping book {i + 1} of {len(file_lines)}.")
#         generate_book_file(file_lines[i][:-1])
#     file.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment