@bebosudo
Created November 24, 2020 08:13
Download all chapters of a given manga from mangaeden.com
#!/usr/bin/env python
#
# Install selenium with 'pip install selenium' and the related
# geckodriver for the Firefox browser.
# Extracts all chapters of the given manga url, but downloads only the given
# chapter and the following ones, to avoid fetching already-downloaded chapters.
#
# The trick is to use a small piece of js code to trigger the download,
# which will auto-download since we load Firefox with a profile that
# never asks to download mangaeden images.
# Selenium is required because the CloudFront CDN recognizes non-browser
# downloads such as curl, even if you use cookies copied from a browser.
#
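# Usage (the file name is whatever you saved this script as):
#   python download_manga.py https://www.mangaeden.com/en/en-manga/onepunch-man/10/3/
#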
import json
import re
import sys
import time
from pathlib import Path
from pprint import pprint

from selenium import webdriver

FILENAME_DIGITS = 5  # Save output files with {N} digits -> up to (10**N)-1 images per chapter
FILENAME_SEP = "___"
COOLDOWN_SECONDS_BETWEEN_IMAGES = 1  # Below 1 sec you may get 503 errors
COOLDOWN_SECONDS_BETWEEN_CHAPTERS = 15


class MangaURL:
    def __init__(self, user_url):
        self._user_url = user_url
        self._url_re = r"https?:\/\/(?:www\.)?mangaeden\.com\/en\/(.*)\/(.*)\/(.+)\/\d+\/"
        self._manga_url_base = "https://www.mangaeden.com/en/{}/{}/{}/1/"
        self._setup_browser()
        self._extract_manga_name_and_chapter()
        self._main()

    def _setup_browser(self):
        fp = webdriver.FirefoxProfile()
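        # Firefox normally shows a "save file" dialog for downloads;
        # whitelisting these MIME types makes it save the images straight to
        # the default download directory, which is what lets the js fetch
        # trick below auto-download every page.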
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "image/jpeg,image/jpg,image/png")
        self._browser = webdriver.Firefox(firefox_profile=fp)

    def _extract_manga_name_and_chapter(self):
        match = re.compile(self._url_re).match(self._user_url)
        if match:
            self._mangaeden, self.manga_name, self.ch_start = match.groups()
        else:
            raise SystemExit("URL {} is not from mangaeden.com; exit.".format(
                self._user_url))
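        # For example, with the sample URL from the usage message the regex
        # groups unpack as:
        #   >>> url = "https://www.mangaeden.com/en/en-manga/onepunch-man/10/3/"
        #   >>> re.match(self._url_re, url).groups()
        #   ('en-manga', 'onepunch-man', '10')
        # i.e. site section, manga name, and the chapter to start from.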

    def _extract_manga_chapters_reverse(self):
        self._browser.get(self._user_url)
        try:
            xpath_box = '//*[@id="combobox"]'
            chapters_prnt = self._browser.find_elements_by_xpath(xpath_box)[0]
        except IndexError:
            raise SystemExit("Couldn't find the chapter list in the page; "
                             "is the url correct? Check if the page is a 404.")
        chapter_html_elem = chapters_prnt.find_elements_by_tag_name('option')
        return [el.text for el in chapter_html_elem]

    def _create_ch_url(self, chapter_num):
        return self._manga_url_base.format(self._mangaeden, self.manga_name,
                                           chapter_num)

    def _extract_chapters_to_download(self):
        all_ch = self._extract_manga_chapters_reverse()
        # Chapters in the html page are in reverse order.
        ch_after_start = all_ch[all_ch.index(self.ch_start)::-1]
        return [(name, self._create_ch_url(name)) for name in ch_after_start]
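        # The slice walks backwards from the starting chapter to the newest
        # one, e.g. with all_ch = ['12', '11', '10', '9'] and ch_start = '10':
        #   >>> all_ch[all_ch.index('10')::-1]
        #   ['10', '11', '12']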

    def _extract_chapter_pages_list(self, chapter_url):
        self._browser.get(chapter_url)
        regex = r"var\ pages\ =\ \[.*\]"
        match = re.compile(regex).search(self._browser.page_source)
        if not match:
            raise SystemExit("Couldn't find the js array within the chapter "
                             "source page")
        text = match.group()
        list_img = json.loads(text[text.find("["):])
        return list_img
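        # The code below relies on each entry of the js array having at least
        # an "fs" key (protocol-relative image url) and an "n" key (0-based
        # page index), roughly:
        #   var pages = [{"n": 0, "fs": "//cdn.mangaeden.com/..."}, ...]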

    def _fetch_all_images(self, ch_name, list_img):
        # Move browser to first image path to avoid CORS error due to different
        # cdn subdomain. Downloads continue in background thanks to js.
        self._browser.get("https:" + list_img[0]["fs"])
        for img_dict in list_img:
            img_url = "https:" + img_dict["fs"]
            n_padded = f'{img_dict["n"]+1:0{FILENAME_DIGITS}}'
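            # The snippet below downloads the image from inside the browser:
            # fetch() pulls it as a blob, an invisible <a> element with a
            # "download" attribute is clicked to save it under the chapter
            # name, and the object url is then revoked to free memory.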
            js_fetcher = """
                fetch('{}')
                    .then(resp => resp.blob())
                    .then(blob => {{
                        const url = window.URL.createObjectURL(blob);
                        const a = document.createElement('a');
                        a.style.display = 'none';
                        a.href = url;
                        a.download = '{}.jpg';  // filename to save the file as
                        document.body.appendChild(a);
                        a.click();
                        window.URL.revokeObjectURL(url);
                    }})
                    .catch(() => alert('oh no!'));
            """.format(img_url, ch_name + FILENAME_SEP + n_padded)
            self._browser.execute_script(js_fetcher)
            print(f'{n_padded}/{len(list_img):0{FILENAME_DIGITS}}:', img_url)
            time.sleep(COOLDOWN_SECONDS_BETWEEN_IMAGES)

    def _rename_downloaded_files(self, ch_name):
        # glob() returns a lazy generator: materialize it into a list so the
        # emptiness check below actually works.
        downloaded_files = list(Path.cwd().glob(ch_name + FILENAME_SEP + "*"))
        if not downloaded_files:
            print("No downloaded files found: execute this script from within "
                  "the download dir files are saved into, so that they can be "
                  "reorganized into per-chapter subdirs.",
                  file=sys.stderr)
            return
        Path(ch_name).mkdir(exist_ok=True)
        for p in downloaded_files:
            p.rename(Path(ch_name) / p.name[p.name.find(FILENAME_SEP) + len(FILENAME_SEP):])
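        # E.g. (assuming chapter '10' and its first page) '10___00001.jpg' in
        # the current dir is moved to '10/00001.jpg'.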

    def _main(self):
        chapters_list = self._extract_chapters_to_download()
        print("Chapters to download:")
        pprint(chapters_list)
        for ch_name, ch_url in chapters_list:
            list_img = self._extract_chapter_pages_list(ch_url)
            print("Start downloading chapter: {}".format(ch_url))
            self._fetch_all_images(ch_name, list_img)
            print("Wait for {} seconds for files to finish downloading".format(
                COOLDOWN_SECONDS_BETWEEN_CHAPTERS))
            time.sleep(COOLDOWN_SECONDS_BETWEEN_CHAPTERS)
            self._rename_downloaded_files(ch_name)
            print("Chapter {} completed, now wait for {} seconds to cool down".format(
                ch_url, COOLDOWN_SECONDS_BETWEEN_CHAPTERS))
            time.sleep(COOLDOWN_SECONDS_BETWEEN_CHAPTERS)


if __name__ == "__main__":
    try:
        user_url = sys.argv[1]
    except IndexError:
        raise SystemExit("usage: {} mangaeden-url\n\n"
                         "Missing url of the manga on mangaeden that you "
                         "want to download, e.g. "
                         "https://www.mangaeden.com/en/en-manga/"
                         "onepunch-man/10/3/; the script will download that "
                         "chapter and all the following ones.".format(sys.argv[0]))
    manga_url = MangaURL(user_url)

@anphetamina

Updated regex

https?:\/\/(?:www2?\.)?mangaeden\.com\/(?:en|it)\/(.*)\/(.*)\/(.+)\/\d+\/
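
(Compared to the regex in the script, this also matches the www2 mirror and the Italian /it/ section of the site.)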
