Tool for downloading zips of scanlations from jaiminisbox. Usage: python jaiminisbox_download.py [url_to_series_page]
import os
import sys
import logging
from functools import partial
from multiprocessing import Pool

import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO)
# Converts a row of the table of contents to a dictionary with two
# values: title and url, where url points to the chapter's zip file.
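# Example of the expected markup (inferred from the selectors below; illustrative, not verified):
# <div class="element">
#   <div class="title"><a href=".../reader/download/...">Chapter 1</a></div>
# </div>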
def row_to_dict(e):
    url = e.find("a")["href"]
    title = e.find("div", class_="title").text
    return {"url": url, "title": title}
def download(url, base_filepath):
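    # Download URLs appear to have the form (inferred from the split below):
    #   https://jaiminisbox.com/reader/download/<series>/<language>/<volume>/<chapter>/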
    remainder = url.split("download", 1)[1]
    metadata = list(filter(None, remainder.split("/")))
    name = metadata[0]
    language = metadata[1]
    volume = int(metadata[2])
    chapter = int(metadata[3])
    if language not in ("en", "en-us"):
        logging.warning("Not downloading %s as it is not English", url)
        return
    location = f"{name}/vol{volume:02}"
    filename = f"chapter{chapter:03}.zip"
    full_path = os.path.join(base_filepath, location, filename)
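    # e.g. <base_filepath>/solo-leveling/vol01/chapter003.zip (illustrative)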
    if os.path.isfile(full_path):
        logging.debug("Skipping %s as it is already downloaded", url)
        return
    logging.info("Downloading %s: Vol. %d Chapt. %d", name, volume, chapter)
    # ensure parent dirs are created
    os.makedirs(os.path.join(base_filepath, location), exist_ok=True)
    resp = requests.get(url, stream=True)
    if resp.status_code != 200:
        logging.error("Failed to download %s (HTTP %d)", url, resp.status_code)
        return
    with open(full_path, "wb") as f:
        for block in resp.iter_content(1024):
            f.write(block)
# series_url: string for series base page (e.g. https://jaiminisbox.com/reader/series/solo-leveling/)
# returns True if successful
def download_series(series_url, base_filepath="."):
    resp = requests.get(series_url)
    if resp.status_code != 200:
        logging.fatal("Failure to read series url")
        return False
    soup = BeautifulSoup(resp.text, "html.parser")
    elements = soup.find_all("div", {"class": "element"})
    chapter_urls = [row_to_dict(e)["url"] for e in elements]
    # limit processes based on avail CPUs (see docs for os.cpu_count())
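    # (note: os.sched_getaffinity is Linux-only; on other platforms,
    # os.cpu_count() would be the usual substitute)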
    pool_sz = max(len(os.sched_getaffinity(0)) - 1, 1) * 2
    logging.info("Creating a pool of size %d", pool_sz)
    with Pool(pool_sz) as p:
        p.map(partial(download, base_filepath=base_filepath), chapter_urls)
    return True
if __name__ == "__main__":
    download_series(sys.argv[1])
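To call the downloader from another script rather than the command line, a minimal sketch (assuming the gist is saved as jaiminisbox_download.py next to your code; the output directory is illustrative):

from jaiminisbox_download import download_series

# Fetch every English chapter of the series into ./manga in parallel,
# skipping chapters that are already on disk.
download_series("https://jaiminisbox.com/reader/series/solo-leveling/",
                base_filepath="manga")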