Skip to content

Instantly share code, notes, and snippets.

@majabojarska
Created June 30, 2022 11:05
Show Gist options
  • Save majabojarska/311f63803325995472e71a2b651fc602 to your computer and use it in GitHub Desktop.
Save majabojarska/311f63803325995472e71a2b651fc602 to your computer and use it in GitHub Desktop.
Downloads all comics from the Poorly Drawn Lines archive.
"""
Downloads all comics from the Poorly Drawn Lines archive.
Comic images are saved in the directory defined by _OUTPUT_DIR.
"""
from pathlib import Path
from typing import Iterable, Tuple, Optional
from urllib.parse import urlparse
import bs4
import requests
# Page that is fetched to discover the links to the individual comic posts.
_URL_ARCHIVE: str = "https://poorlydrawnlines.com/archive/"
# Parser name handed to BeautifulSoup for every page this script parses.
_PARSER_HTML: str = "html.parser"
# Directory (relative to the current working directory) that receives the
# downloaded image files when the script is run directly.
_OUTPUT_DIR: Path = Path("output")
def _get_posts_in_archive(timeout: float = 30.0) -> Iterable[Tuple[str, str]]:
    """Return the link text and target of every post listed on the archive page.

    Args:
        timeout: Value passed to ``requests.get`` as its ``timeout``, in
            seconds, so a stalled server cannot hang the script forever.

    Returns:
        A list of ``(text, href)`` tuples, one for each anchor carrying an
        ``href`` attribute inside the ``div`` with class "content page".

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
        ValueError: If the page contains no ``div`` with class "content page".
    """
    res_archive = requests.get(_URL_ARCHIVE, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as the archive.
    res_archive.raise_for_status()

    archive = bs4.BeautifulSoup(res_archive.text, _PARSER_HTML)
    div_content: Optional[bs4.Tag] = archive.find("div", class_="content page")
    if div_content is None:
        # find() returns None when nothing matches; report that explicitly
        # rather than letting an AttributeError surface on the next line.
        raise ValueError(
            f"Failed to find the 'content page' div in {_URL_ARCHIVE}."
        )

    post_anchors: Iterable[bs4.Tag] = div_content.find_all("a", href=True)
    return [(elem.text, elem.attrs["href"]) for elem in post_anchors]
def _get_img_url_from_post(url: str, timeout: float = 30.0) -> str:
    """Return the ``src`` of the first image inside a post's "post" ``div``.

    Args:
        url: Address of the post page to fetch and parse.
        timeout: Value passed to ``requests.get`` as its ``timeout``, in
            seconds, so a stalled server cannot hang the script forever.

    Returns:
        The value of the ``src`` attribute of the first ``img`` tag found
        inside the first ``div`` with class "post".

    Raises:
        ValueError: If the page has no ``div`` with class "post", that
            ``div`` contains no ``img`` tag, or the ``img`` has no ``src``.
        requests.RequestException: If the request fails or times out.
    """
    page = bs4.BeautifulSoup(
        requests.get(url, timeout=timeout).text, _PARSER_HTML
    )

    # find() returns None when nothing matches, so the lookup cannot be
    # chained blindly: a page without the "post" div must end up as the same
    # ValueError that a missing image produces, not as an AttributeError.
    div_post: Optional[bs4.Tag] = page.find("div", class_="post")
    img: Optional[bs4.Tag] = div_post.find("img") if div_post is not None else None
    if img is None:
        raise ValueError("Failed to find 'img' tag in the target post.")

    src: Optional[str] = img.attrs.get("src")
    if not src:
        # An <img> without a usable src is just as undownloadable as no <img>.
        raise ValueError("The 'img' tag in the target post has no 'src' attribute.")
    return src
def _get_img(url: str, path: Path, timeout: float = 30.0) -> None:
    """Download the resource at ``url`` and store it at ``path``.

    The body is streamed into a sibling ``<name>.part`` file which is renamed
    to ``path`` only after the whole download succeeded. A failed or
    interrupted download therefore never leaves a truncated file at ``path``
    that an "already exists" check would later mistake for a complete one.

    Args:
        url: Address of the file to download.
        path: Destination file; an existing file is overwritten.
        timeout: Value passed to ``requests.get`` as its ``timeout``, in
            seconds, so a stalled server cannot hang the script forever.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
        OSError: If the file cannot be written or renamed.
    """
    path_partial = path.parent / (path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as res:
            res.raise_for_status()
            with open(path_partial, "wb") as img:
                for chunk in res.iter_content(chunk_size=8192):
                    img.write(chunk)
        # Publish the file under its final name only once it is complete.
        path_partial.replace(path)
    except BaseException:
        # Also covers Ctrl-C: remove the leftover partial file, then let the
        # original exception continue to propagate unchanged.
        if path_partial.exists():
            path_partial.unlink()
        raise
def _scrape_comics_from_archive(path_output: Path):
    """Download the image of every archived post into ``path_output``.

    Each post returned by ``_get_posts_in_archive`` is resolved to an image
    URL. The image is stored under the last component of that URL's path.
    Posts for which no image URL can be determined (signalled by a
    ``ValueError``) and images whose target file already exists are skipped,
    with a message printed for each.

    Args:
        path_output: Directory the image files are written to.
    """
    for _title, post_url in _get_posts_in_archive():
        try:
            image_url: str = _get_img_url_from_post(post_url)
        except ValueError:
            print(f"Skipping {post_url} due to missing 'img' tag.")
            continue

        # The file name is the final segment of the image URL's path part.
        file_name: str = Path(urlparse(image_url).path).name
        target: Path = path_output / file_name

        if target.exists():
            print(f"{target.name} is already downloaded, skipping.")
            continue

        _get_img(url=image_url, path=target)
        print(target.name)
if __name__ == "__main__":
    # Create the output directory, including any missing parents, before the
    # scraper starts writing image files into it.
    _OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    _scrape_comics_from_archive(path_output=_OUTPUT_DIR)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment