majabojarska/scrape_poorly_drawn_lines.py

## scrape_poorly_drawn_lines.py
"""
Downloads all comics from the Poorly Drawn Lines archive.

Comic images are saved in the directory defined by _OUTPUT_DIR.
"""

from pathlib import Path
from typing import Iterable, Tuple, Optional
from urllib.parse import urljoin, urlparse

import bs4
import requests

# Archive page whose anchors are followed to locate the individual comic posts.
_URL_ARCHIVE: str = "https://poorlydrawnlines.com/archive/"
# Parser name handed to bs4.BeautifulSoup for every fetched page.
_PARSER_HTML: str = "html.parser"
# Directory the downloaded comic images are written to.
_OUTPUT_DIR: Path = Path("output")


def _get_posts_in_archive(timeout: float = 30.0) -> Iterable[Tuple[str, str]]:
    """Collect the posts linked from the archive page.

    Args:
        timeout: Seconds to wait for the archive server before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A list of ``(link text, href)`` pairs, one for each anchor carrying
        an ``href`` attribute inside the ``<div class="content page">``
        element of the archive page.

    Raises:
        requests.HTTPError: If the archive page answers with an error status.
        ValueError: If the archive page has no ``<div class="content page">``.
    """
    res_archive = requests.get(_URL_ARCHIVE, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as the archive.
    res_archive.raise_for_status()

    archive = bs4.BeautifulSoup(res_archive.text, _PARSER_HTML)
    div_content: Optional[bs4.Tag] = archive.find("div", class_="content page")

    if div_content is None:
        # find() returns None on a miss; report that instead of letting the
        # following attribute access die with an opaque AttributeError.
        raise ValueError("Failed to find the post listing in the archive page.")

    return [
        (anchor.text, anchor.attrs["href"])
        for anchor in div_content.find_all("a", href=True)
    ]


def _get_img_url_from_post(url: str, timeout: float = 30.0) -> str:
    """Return the ``src`` of the first image inside a post's ``div.post``.

    Args:
        url: Address of the post page to inspect.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value of the ``src`` attribute of the first ``img`` tag found
        inside the first ``<div class="post">`` of the page.

    Raises:
        ValueError: If the page has no ``<div class="post">``, that div holds
            no ``img`` tag, or the ``img`` tag has no ``src`` attribute.
    """
    page = bs4.BeautifulSoup(requests.get(url, timeout=timeout).text, _PARSER_HTML)

    # Every "not found" case is reported as ValueError: that is the one
    # exception type the scraping loop treats as "skip this post".
    div_post: Optional[bs4.Tag] = page.find("div", class_="post")
    if div_post is None:
        raise ValueError("Failed to find 'div.post' in the target post.")

    img: Optional[bs4.Tag] = div_post.find("img")
    if img is None:
        raise ValueError("Failed to find 'img' tag in the target post.")

    src = img.attrs.get("src")
    if not src:
        raise ValueError("The 'img' tag in the target post has no 'src'.")

    return src


def _get_img(url: str, path: Path, timeout: float = 30.0) -> None:
    """Download ``url`` and store the response body at ``path``.

    The body is streamed into a sibling ``<name>.part`` file that is renamed
    onto ``path`` only after the whole transfer succeeded. An interrupted
    download therefore never leaves a truncated file at ``path`` that a later
    "already exists" check would mistake for a finished image.

    Args:
        url: Address of the image to download.
        path: Destination file; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    path = Path(path)
    path_partial = path.with_name(path.name + ".part")

    with requests.get(url, stream=True, timeout=timeout) as res:
        res.raise_for_status()

        try:
            with open(path_partial, "wb") as img:
                for chunk in res.iter_content(chunk_size=8192):
                    img.write(chunk)
        except BaseException:
            # Drop the incomplete download (also on Ctrl+C), then re-raise.
            try:
                path_partial.unlink()
            except FileNotFoundError:
                pass
            raise

    # Publish the finished file under its final name.
    path_partial.replace(path)


def _scrape_comics_from_archive(path_output: Path) -> None:
    """Download the image of every archived post into ``path_output``.

    Posts whose image URL cannot be determined are reported and skipped, and
    so are images whose file already exists in ``path_output``. Progress is
    printed to stdout.

    Args:
        path_output: Existing directory that receives the image files. Each
            file is named after the last path component of its image URL.
    """
    for _title, href_post in _get_posts_in_archive():
        # Anchors may hold relative links; resolve them against the archive.
        url_post: str = urljoin(_URL_ARCHIVE, href_post)

        try:
            src_img: str = _get_img_url_from_post(url_post)
        except ValueError:
            print(f"Skipping {url_post} due to missing 'img' tag.")
            continue

        # Same for the image: 'src' may be relative or protocol-relative.
        url_img: str = urljoin(url_post, src_img)
        name_img: str = Path(urlparse(url_img).path).name

        if not name_img:
            # Without a file name the target would be the output directory
            # itself, which always "exists" and would be reported as done.
            print(f"Skipping {url_post} due to missing file name in image URL.")
            continue

        path_img: Path = path_output / name_img

        if path_img.exists():
            print(f"{path_img.name} is already downloaded, skipping.")
        else:
            _get_img(url=url_img, path=path_img)
            print(path_img.name)


if __name__ == "__main__":
    # Script entry point: make sure the download directory (and any missing
    # parents) exists, then fetch every comic into it.
    _OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    _scrape_comics_from_archive(path_output=_OUTPUT_DIR)
	"""
	Downloads all comics from the Poorly Drawn Lines archive.

	Comic images are saved in the directory defined by _OUTPUT_DIR.
	"""

	from pathlib import Path
	from typing import Iterable, Tuple, Optional
	from urllib.parse import urlparse

	import bs4
	import requests

	# Archive page whose anchors are followed to locate the individual comic posts.
	_URL_ARCHIVE: str = "https://poorlydrawnlines.com/archive/"
	# Parser name handed to bs4.BeautifulSoup for every fetched page.
	_PARSER_HTML: str = "html.parser"
	# Directory the downloaded comic images are written to.
	_OUTPUT_DIR: Path = Path("output")


	def _get_posts_in_archive(timeout: float = 30.0) -> Iterable[Tuple[str, str]]:
	    """Collect the posts linked from the archive page.

	    Args:
	        timeout: Seconds to wait for the archive server before giving up.
	            Without a timeout ``requests`` may block indefinitely.

	    Returns:
	        A list of ``(link text, href)`` pairs, one for each anchor carrying
	        an ``href`` attribute inside the ``<div class="content page">``
	        element of the archive page.

	    Raises:
	        requests.HTTPError: If the archive page answers with an error status.
	        ValueError: If the archive page has no ``<div class="content page">``.
	    """
	    res_archive = requests.get(_URL_ARCHIVE, timeout=timeout)
	    # Fail loudly on 4xx/5xx instead of parsing an error page as the archive.
	    res_archive.raise_for_status()

	    archive = bs4.BeautifulSoup(res_archive.text, _PARSER_HTML)
	    div_content: Optional[bs4.Tag] = archive.find("div", class_="content page")

	    if div_content is None:
	        # find() returns None on a miss; report that instead of letting the
	        # following attribute access die with an opaque AttributeError.
	        raise ValueError("Failed to find the post listing in the archive page.")

	    return [
	        (anchor.text, anchor.attrs["href"])
	        for anchor in div_content.find_all("a", href=True)
	    ]


	def _get_img_url_from_post(url: str, timeout: float = 30.0) -> str:
	    """Return the ``src`` of the first image inside a post's ``div.post``.

	    Args:
	        url: Address of the post page to inspect.
	        timeout: Seconds to wait for the server before giving up.

	    Returns:
	        The value of the ``src`` attribute of the first ``img`` tag found
	        inside the first ``<div class="post">`` of the page.

	    Raises:
	        ValueError: If the page has no ``<div class="post">``, that div holds
	            no ``img`` tag, or the ``img`` tag has no ``src`` attribute.
	    """
	    page = bs4.BeautifulSoup(requests.get(url, timeout=timeout).text, _PARSER_HTML)

	    # Every "not found" case is reported as ValueError: that is the one
	    # exception type the scraping loop treats as "skip this post".
	    div_post: Optional[bs4.Tag] = page.find("div", class_="post")
	    if div_post is None:
	        raise ValueError("Failed to find 'div.post' in the target post.")

	    img: Optional[bs4.Tag] = div_post.find("img")
	    if img is None:
	        raise ValueError("Failed to find 'img' tag in the target post.")

	    src = img.attrs.get("src")
	    if not src:
	        raise ValueError("The 'img' tag in the target post has no 'src'.")

	    return src


	def _get_img(url: str, path: Path, timeout: float = 30.0) -> None:
	    """Download ``url`` and store the response body at ``path``.

	    The body is streamed into a sibling ``<name>.part`` file that is renamed
	    onto ``path`` only after the whole transfer succeeded. An interrupted
	    download therefore never leaves a truncated file at ``path`` that a later
	    "already exists" check would mistake for a finished image.

	    Args:
	        url: Address of the image to download.
	        path: Destination file; an existing file is overwritten.
	        timeout: Seconds to wait for the server before giving up.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	    """
	    path = Path(path)
	    path_partial = path.with_name(path.name + ".part")

	    with requests.get(url, stream=True, timeout=timeout) as res:
	        res.raise_for_status()

	        try:
	            with open(path_partial, "wb") as img:
	                for chunk in res.iter_content(chunk_size=8192):
	                    img.write(chunk)
	        except BaseException:
	            # Drop the incomplete download (also on Ctrl+C), then re-raise.
	            try:
	                path_partial.unlink()
	            except FileNotFoundError:
	                pass
	            raise

	    # Publish the finished file under its final name.
	    path_partial.replace(path)


	def _scrape_comics_from_archive(path_output: Path) -> None:
	    """Download the image of every archived post into ``path_output``.

	    Posts whose image URL cannot be determined are reported and skipped, and
	    so are images whose file already exists in ``path_output``. Progress is
	    printed to stdout.

	    Args:
	        path_output: Existing directory that receives the image files. Each
	            file is named after the last path component of its image URL.
	    """
	    for _title, href_post in _get_posts_in_archive():
	        # Anchors may hold relative links; resolve them against the archive.
	        url_post: str = urljoin(_URL_ARCHIVE, href_post)

	        try:
	            src_img: str = _get_img_url_from_post(url_post)
	        except ValueError:
	            print(f"Skipping {url_post} due to missing 'img' tag.")
	            continue

	        # Same for the image: 'src' may be relative or protocol-relative.
	        url_img: str = urljoin(url_post, src_img)
	        name_img: str = Path(urlparse(url_img).path).name

	        if not name_img:
	            # Without a file name the target would be the output directory
	            # itself, which always "exists" and would be reported as done.
	            print(f"Skipping {url_post} due to missing file name in image URL.")
	            continue

	        path_img: Path = path_output / name_img

	        if path_img.exists():
	            print(f"{path_img.name} is already downloaded, skipping.")
	        else:
	            _get_img(url=url_img, path=path_img)
	            print(path_img.name)


	if __name__ == "__main__":
	    # Script entry point: make sure the download directory (and any missing
	    # parents) exists, then fetch every comic into it.
	    _OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
	    _scrape_comics_from_archive(path_output=_OUTPUT_DIR)