Python 3 script for downloading all the pages of the webcomic On a Sunbeam (http://www.onasunbeam.com/). Requires Requests (http://docs.python-requests.org/) and Beautiful Soup 4 (https://www.crummy.com/software/BeautifulSoup/).
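Both dependencies are available on PyPI and can be installed with pip (pip install requests beautifulsoup4) before running the script.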
#!/usr/bin/env python3
'''Download the comic *On a Sunbeam*.'''
import shutil
from os.path import basename
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def get_chapter_links(url):
    '''Yield the links from the html at `url` that start with "chapter".'''
    homepage_response = requests.get(url)
    soup = BeautifulSoup(homepage_response.text, 'html.parser')
    # Only look at anchors that actually carry an href attribute.
    link_elements = soup.find_all('a', href=True)
    for link_element in link_elements:
        if link_element.get('href').startswith('chapter'):
            yield urljoin(url, link_element.get('href'))


def get_chapter_image_links(url, homepage):
    '''
    Yield the urls of images from the html at `url` from the website root
    `homepage`.

    The images are from `<img>` elements with the class of `chapter-page` and
    the urls are in the `data-original` attribute.
    '''
    chapter_response = requests.get(url)
    soup = BeautifulSoup(chapter_response.text, 'html.parser')
    chapter_img_elements = soup.find_all('img', class_='chapter-page')
    for chapter_img_element in chapter_img_elements:
        yield urljoin(homepage, chapter_img_element.get('data-original'))


def download_file(url, destination, log=False):
    '''
    Download the file at `url` to `destination`.

    Source: https://stackoverflow.com/a/13137873
    '''
    response = requests.get(url, stream=True)
    if log:
        print('Downloading {} to {}'.format(url, destination))
    if response.status_code != 200:
        raise Exception(
            'Request to {url} returned status code: {code} {reason}'.format(
                url=response.url,
                code=response.status_code,
                reason=response.reason
            )
        )
    with open(destination, 'wb') as destination_file:
        # Decompress gzip/deflate-encoded responses as the raw stream is read.
        response.raw.decode_content = True
        shutil.copyfileobj(response.raw, destination_file)


def download_sunbeam(homepage):
    '''
    Download all the pages of *On a Sunbeam* to the current directory.

    `homepage` is the homepage of the comic.
    '''
    for chapter_link in get_chapter_links(homepage):
        for chapter_image_link in get_chapter_image_links(chapter_link, homepage):  # noqa: E501
            filename = basename(urlparse(chapter_image_link).path)
            download_file(chapter_image_link, filename, log=True)


if __name__ == '__main__':
    download_sunbeam('http://www.onasunbeam.com/')
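

By default the script drops every page image into the directory it is run from. As a minimal sketch (not part of the original gist), the same functions can be pointed at a dedicated folder instead; this assumes the script above has been saved as sunbeam.py so its functions can be imported, and the folder name on_a_sunbeam is arbitrary:

#!/usr/bin/env python3
'''Hypothetical wrapper: save the comic's pages into ./on_a_sunbeam/.'''
import os
from os.path import basename
from urllib.parse import urlparse

# Assumes the script above was saved as `sunbeam.py` in the same directory.
from sunbeam import get_chapter_links, get_chapter_image_links, download_file


def download_sunbeam_to(homepage, target_dir):
    '''Download all pages of the comic at `homepage` into `target_dir`.'''
    os.makedirs(target_dir, exist_ok=True)
    for chapter_link in get_chapter_links(homepage):
        for image_link in get_chapter_image_links(chapter_link, homepage):
            filename = basename(urlparse(image_link).path)
            destination = os.path.join(target_dir, filename)
            download_file(image_link, destination, log=True)


if __name__ == '__main__':
    download_sunbeam_to('http://www.onasunbeam.com/', 'on_a_sunbeam')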