@bebosudo
Created November 24, 2020 08:13
Download all chapters of a given manga from mangaeden.com
#!/usr/bin/env python
#
# Install selenium with 'pip install selenium' and the related
# geckodriver for the Firefox browser.
# Extracts all chapters of the given manga url, but downloads only the given
# chapter and the following ones, to avoid fetching already-downloaded chapters.
#
# The trick is to use a small piece of js code to trigger the download,
# which will auto-download since we load Firefox with a profile that
# never asks to download mangaeden images.
# Selenium is required because the CloudFront CDN recognizes non-browser
# downloads such as curl, even if you use cookies copied from a browser.
#
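# Usage (the file name is whatever you saved this script as):
#   python download_manga.py https://www.mangaeden.com/en/en-manga/onepunch-man/10/3/
#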
import json
import re
import sys
import time
from pathlib import Path
from pprint import pprint

from selenium import webdriver

FILENAME_DIGITS = 5  # Save output files with {N} digits -> up to (10**N)-1 images per chapter
FILENAME_SEP = "___"
COOLDOWN_SECONDS_BETWEEN_IMAGES = 1  # Below 1 sec you may get 503 errors
COOLDOWN_SECONDS_BETWEEN_CHAPTERS = 15


class MangaURL:
    def __init__(self, user_url):
        self._user_url = user_url
        self._url_re = r"https?:\/\/(?:www\.)?mangaeden\.com\/en\/(.*)\/(.*)\/(.+)\/\d+\/"
        self._manga_url_base = "https://www.mangaeden.com/en/{}/{}/{}/1/"
        self._setup_browser()
        self._extract_manga_name_and_chapter()
        self._main()

    def _setup_browser(self):
        fp = webdriver.FirefoxProfile()
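        # Firefox normally shows a "save file" dialog for downloads;
        # whitelisting these MIME types makes it save the images straight to
        # the default download directory, which is what lets the js fetch
        # trick below auto-download every page.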
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "image/jpeg,image/jpg,image/png")
        self._browser = webdriver.Firefox(firefox_profile=fp)

    def _extract_manga_name_and_chapter(self):
        match = re.compile(self._url_re).match(self._user_url)
        if match:
            self._mangaeden, self.manga_name, self.ch_start = match.groups()
        else:
            raise SystemExit("URL {} is not from mangaeden.com; exit.".format(
                self._user_url))
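        # For example, with the sample URL from the usage message the regex
        # groups unpack as:
        #   >>> url = "https://www.mangaeden.com/en/en-manga/onepunch-man/10/3/"
        #   >>> re.match(self._url_re, url).groups()
        #   ('en-manga', 'onepunch-man', '10')
        # i.e. site section, manga name, and the chapter to start from.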

    def _extract_manga_chapters_reverse(self):
        self._browser.get(self._user_url)
        try:
            xpath_box = '//*[@id="combobox"]'
            chapters_prnt = self._browser.find_elements_by_xpath(xpath_box)[0]
        except IndexError:
            raise SystemExit("Couldn't find the chapter list in the page; "
                             "is the url correct? Check if the page is a 404.")
        chapter_html_elem = chapters_prnt.find_elements_by_tag_name('option')
        return [el.text for el in chapter_html_elem]

    def _create_ch_url(self, chapter_num):
        return self._manga_url_base.format(self._mangaeden, self.manga_name,
                                           chapter_num)

    def _extract_chapters_to_download(self):
        all_ch = self._extract_manga_chapters_reverse()
        # Chapters in the html page are in reverse order.
        ch_after_start = all_ch[all_ch.index(self.ch_start)::-1]
        return [(name, self._create_ch_url(name)) for name in ch_after_start]
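        # The slice walks backwards from the starting chapter to the newest
        # one, e.g. with all_ch = ['12', '11', '10', '9'] and ch_start = '10':
        #   >>> all_ch[all_ch.index('10')::-1]
        #   ['10', '11', '12']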

    def _extract_chapter_pages_list(self, chapter_url):
        self._browser.get(chapter_url)
        regex = r"var\ pages\ =\ \[.*\]"
        match = re.compile(regex).search(self._browser.page_source)
        if not match:
            raise SystemExit("Couldn't find the js array within the chapter "
                             "source page")
        text = match.group()
        list_img = json.loads(text[text.find("["):])
        return list_img
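        # The code below relies on each entry of the js array having at least
        # an "fs" key (protocol-relative image url) and an "n" key (0-based
        # page index), roughly:
        #   var pages = [{"n": 0, "fs": "//cdn.mangaeden.com/..."}, ...]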

    def _fetch_all_images(self, ch_name, list_img):
        # Move browser to first image path to avoid CORS error due to different
        # cdn subdomain. Downloads continue in background thanks to js.
        self._browser.get("https:" + list_img[0]["fs"])
        for img_dict in list_img:
            img_url = "https:" + img_dict["fs"]
            n_padded = f'{img_dict["n"]+1:0{FILENAME_DIGITS}}'
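            # The snippet below downloads the image from inside the browser:
            # fetch() pulls it as a blob, an invisible <a> element with a
            # "download" attribute is clicked to save it under the chapter
            # name, and the object url is then revoked to free memory.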
            js_fetcher = """
                fetch('{}')
                    .then(resp => resp.blob())
                    .then(blob => {{
                        const url = window.URL.createObjectURL(blob);
                        const a = document.createElement('a');
                        a.style.display = 'none';
                        a.href = url;
                        a.download = '{}.jpg';  // filename to save the file as
                        document.body.appendChild(a);
                        a.click();
                        window.URL.revokeObjectURL(url);
                    }})
                    .catch(() => alert('oh no!'));
            """.format(img_url, ch_name + FILENAME_SEP + n_padded)
            self._browser.execute_script(js_fetcher)
            print(f'{n_padded}/{len(list_img):0{FILENAME_DIGITS}}:', img_url)
            time.sleep(COOLDOWN_SECONDS_BETWEEN_IMAGES)

    def _rename_downloaded_files(self, ch_name):
        # glob() returns a lazy generator: materialize it into a list so the
        # emptiness check below actually works.
        downloaded_files = list(Path.cwd().glob(ch_name + FILENAME_SEP + "*"))
        if not downloaded_files:
            print("No downloaded files found: execute this script from within "
                  "the download dir files are saved into, so that they can be "
                  "reorganized into per-chapter subdirs.",
                  file=sys.stderr)
            return
        Path(ch_name).mkdir(exist_ok=True)
        for p in downloaded_files:
            p.rename(Path(ch_name) / p.name[p.name.find(FILENAME_SEP) + len(FILENAME_SEP):])
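        # E.g. (assuming chapter '10' and its first page) '10___00001.jpg' in
        # the current dir is moved to '10/00001.jpg'.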

    def _main(self):
        chapters_list = self._extract_chapters_to_download()
        print("Chapters to download:")
        pprint(chapters_list)
        for ch_name, ch_url in chapters_list:
            list_img = self._extract_chapter_pages_list(ch_url)
            print("Start downloading chapter: {}".format(ch_url))
            self._fetch_all_images(ch_name, list_img)
            print("Wait for {} seconds for files to finish downloading".format(
                COOLDOWN_SECONDS_BETWEEN_CHAPTERS))
            time.sleep(COOLDOWN_SECONDS_BETWEEN_CHAPTERS)
            self._rename_downloaded_files(ch_name)
            print("Chapter {} completed, now wait for {} seconds to cool down".format(
                ch_url, COOLDOWN_SECONDS_BETWEEN_CHAPTERS))
            time.sleep(COOLDOWN_SECONDS_BETWEEN_CHAPTERS)


if __name__ == "__main__":
    try:
        user_url = sys.argv[1]
    except IndexError:
        raise SystemExit("usage: {} mangaeden-url\n\n"
                         "Missing url of the manga on mangaeden that you "
                         "want to download, e.g. "
                         "https://www.mangaeden.com/en/en-manga/"
                         "onepunch-man/10/3/; the script will download that "
                         "chapter and all the following ones.".format(sys.argv[0]))
    manga_url = MangaURL(user_url)

@anphetamina

Updated regex

https?:\/\/(?:www2?\.)?mangaeden\.com\/(?:en|it)\/(.*)\/(.*)\/(.+)\/\d+\/
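
(Compared to the regex in the script, this also matches the www2 mirror and the Italian /it/ section of the site.)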
