Skip to content

Instantly share code, notes, and snippets.

@cacharle
Created August 15, 2020 09:42
Show Gist options
  • Save cacharle/471e34ffe21ed695d34e3c7486e02579 to your computer and use it in GitHub Desktop.
Save cacharle/471e34ffe21ed695d34e3c7486e02579 to your computer and use it in GitHub Desktop.
Scrape omgbeaupeep.com for the Avatar: The Last Airbender comics
#!/usr/bin/python3
import os
import sys
import itertools
import requests
from bs4 import BeautifulSoup
# Page URL template: zero-padded 3-digit comic number, then the page number.
url_fmt = "https://www.omgbeaupeep.com/comics/Avatar_The_Last_Airbender/{:03}/{}/"
# Root output directory (relative to the current working directory).
dir_name = "avatar_comics"
# requests applies no timeout by default; without one a stalled server
# would hang the script forever.
request_timeout = 30


def _fetch(url, failure_message):
    """Return the response of a GET on *url*.

    Prints *failure_message* and raises IOError when the HTTP status is not
    200, so the caller never has to inspect the status code itself.
    """
    response = requests.get(url, timeout=request_timeout)
    if response.status_code != 200:
        print(failure_message)
        raise IOError(f"GET {url} returned HTTP {response.status_code}")
    return response


def _download_comic(comic_id):
    """Download every page image of comic *comic_id* into its own directory.

    Pages are requested in order starting at 1. The comic is considered
    finished at the first page whose HTML contains no <img> tag carrying a
    ``class`` attribute.

    Raises:
        IOError: a page or image request did not return HTTP 200, or the
            selected image tag has no ``src`` attribute.
    """
    print(f"Downloading comic {comic_id:03}")
    comic_dir = os.path.join(dir_name, f"comic-{comic_id:03}")
    os.makedirs(comic_dir, exist_ok=True)

    for page_id in itertools.count(1):
        page = _fetch(
            url_fmt.format(comic_id, page_id),
            f"Couldn't download page {page_id}",
        )
        soup = BeautifulSoup(page.content, "html.parser")

        # The first <img> that has a class attribute is taken as the comic
        # page; when there is none we have run past the last page.
        img = next(
            (tag for tag in soup.find_all("img") if tag.get("class") is not None),
            None,
        )
        if img is None:
            break

        img_src = img.get("src")
        if not img_src:
            raise IOError(f"Image on page {page_id} has no src attribute")

        # Keep only what follows the last "-" of the file name (the whole
        # name when it contains no "-").
        file_name = os.path.basename(img_src).rpartition("-")[2]
        page_path = os.path.join(comic_dir, file_name)
        print(f"Downloading page {page_id} to {page_path}")

        image = _fetch(
            f"https://www.omgbeaupeep.com/comics/{img_src}",
            f"Couldn't download page src {page_id}",
        )
        with open(page_path, "wb") as f:
            f.write(image.content)


def main(argv=None):
    """Download the comics numbered ``from``..``to`` (inclusive).

    Args:
        argv: argument vector in ``sys.argv`` layout (program name, from,
            to). Defaults to ``sys.argv``.

    Returns:
        0 on success, 1 when the arguments are missing or not integers.
        Arguments are validated before anything is created on disk.
    """
    if argv is None:
        argv = sys.argv
    usage = f"Usage: {argv[0] if argv else sys.argv[0]} from to"

    if len(argv) != 3:
        print(usage)
        return 1
    try:
        comic_from = int(argv[1])
        comic_to = int(argv[2])
    except ValueError:
        print(usage)
        return 1

    os.makedirs(dir_name, exist_ok=True)
    for comic_id in range(comic_from, comic_to + 1):
        _download_comic(comic_id)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment