Tool for downloading zips of scanlations from jaiminisbox. Usage: python jaiminisbox_download.py [url_to_series_page]
import os
import sys
import logging
from functools import partial
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO)
# Converts a row of the table of contents to a dictionary with two
# values: title and url, where url points to the chapter's zip file.
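# Example of the expected markup (inferred from the selectors below; illustrative, not verified):
# <div class="element">
#   <div class="title"><a href=".../reader/download/...">Chapter 1</a></div>
# </div>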
def row_to_dict(e):
    url = e.find("a")["href"]
    title = e.find("div", class_="title").text
    return {"url": url, "title": title}
def download(url, base_filepath):
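    # Download URLs appear to have the form (inferred from the split below):
    #   https://jaiminisbox.com/reader/download/<series>/<language>/<volume>/<chapter>/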
    remainder = url.split("download", 1)[1]
    metadata = list(filter(None, remainder.split("/")))
    name = metadata[0]
    language = metadata[1]
    volume = int(metadata[2])
    chapter = int(metadata[3])
    if language not in ("en", "en-us"):
        logging.warning("Not downloading %s as it is not English", url)
        return
    location = f"{name}/vol{volume:02}"
    filename = f"chapter{chapter:03}.zip"
    full_path = os.path.join(base_filepath, location, filename)
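    # e.g. <base_filepath>/solo-leveling/vol01/chapter003.zip (illustrative)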
    if os.path.isfile(full_path):
        logging.debug("Skipping %s as it is already downloaded", url)
        return
    logging.info("Downloading %s: Vol. %d Chapt. %d", name, volume, chapter)
    # ensure parent dirs are created
    os.makedirs(os.path.join(base_filepath, location), exist_ok=True)
    resp = requests.get(url, stream=True)
    if resp.status_code != 200:
        logging.error("Failed to download %s (HTTP %d)", url, resp.status_code)
        return
    with open(full_path, "wb") as f:
        for block in resp.iter_content(1024):
            f.write(block)
# series_url: string for series base page (e.g. https://jaiminisbox.com/reader/series/solo-leveling/)
# returns True if successful
def download_series(series_url, base_filepath="."):
    resp = requests.get(series_url)
    if resp.status_code != 200:
        logging.fatal("Failure to read series url")
        return False
    soup = BeautifulSoup(resp.text, "html.parser")
    elements = soup.find_all("div", {"class": "element"})
    chapter_urls = [row_to_dict(e)["url"] for e in elements]
    # limit processes based on avail CPUs (see docs for os.cpu_count())
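    # (note: os.sched_getaffinity is Linux-only; on other platforms,
    # os.cpu_count() would be the usual substitute)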
    pool_sz = max(len(os.sched_getaffinity(0)) - 1, 1) * 2
    logging.info("Creating a pool of size %d", pool_sz)
    with Pool(pool_sz) as p:
        p.map(partial(download, base_filepath=base_filepath), chapter_urls)
    return True
if __name__ == "__main__":
    download_series(sys.argv[1])
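To call the downloader from another script rather than the command line, a minimal sketch (assuming the gist is saved as jaiminisbox_download.py next to your code; the output directory is illustrative):

from jaiminisbox_download import download_series

# Fetch every English chapter of the series into ./manga in parallel,
# skipping chapters that are already on disk.
download_series("https://jaiminisbox.com/reader/series/solo-leveling/",
                base_filepath="manga")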