Skip to content

Instantly share code, notes, and snippets.

@APadierna
Last active November 14, 2021 14:58
Show Gist options
  • Save APadierna/d5a12a301b318397a7ed to your computer and use it in GitHub Desktop.
Save APadierna/d5a12a301b318397a7ed to your computer and use it in GitHub Desktop.
Script to download the Dilbert comic strips
#!/usr/bin/env python
"""
Simple script to download the Dilbert comic strips in a defined period of time.
If no arguments are passed to the script, it will download all the Dilbert comic
strips into the current folder (it may take a while).
Acknowledgments
---------------
This script is strongly based on the work from:
https://community.spiceworks.com/scripts/show/982-download-all-dilbert-comics
"""
import datetime
import os
import re
import sys
import time
import argparse
from dateutil import rrule, parser
# for backwards compatibility
if sys.version_info[0] > 2:
import urllib.request as ul
else:
import urllib as ul
def main():
    """
    Parse the command line, move to the dump folder and download the strips.

    The dump folder is created when it does not exist yet. If it cannot be
    created, a warning is written to stderr and the strips are downloaded
    into the current folder instead.
    """
    args = parse_input_arguments()

    # If a dump folder has been defined, create it (if it does not already
    # exist) and move to it.
    try:
        if args.output != '.' and not os.path.isdir(args.output):
            os.makedirs(args.output)
    except OSError as error:
        # Best effort: only file-system failures trigger the fallback, so
        # that programming errors and Ctrl-C are no longer silently hidden.
        sys.stderr.write('Could not create folder %r (%s); '
                         'using the current folder instead.\n'
                         % (args.output, error))
        args.output = '.'

    os.chdir(args.output)
    download_strips(args.start_date, args.end_date)
def parse_input_arguments(argv=None):
    """
    Parse the command line options of the script.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse. When omitted, argparse reads ``sys.argv[1:]``,
        which is the behaviour of the script when run from a shell.

    Returns
    -------
    argparse.Namespace
        Namespace with ``start_date`` and ``end_date`` as
        ``datetime.datetime`` objects and ``output`` as the dump folder path.
    """
    argp = argparse.ArgumentParser(description='Dilbert strips download script.')
    argp.add_argument("-s", "--start",
                      help="start date (1989-04-17, 1st published strip).",
                      dest="start_date",
                      default='1989-04-17')
    argp.add_argument("-e", "--end",
                      dest="end_date",
                      help="End date (default, today)",
                      default=None)
    argp.add_argument("-o", "--output",
                      dest="output",
                      help="Comics dump folder",
                      default='.')
    args = argp.parse_args(argv)

    if args.end_date is None:
        # Midnight of today, so that both dates are always datetime objects
        # (a user supplied end date is parsed into a datetime as well).
        args.end_date = datetime.datetime.combine(datetime.date.today(),
                                                  datetime.time())
    else:
        args.end_date = parser.parse(args.end_date)
    args.start_date = parser.parse(args.start_date)
    return args
def download_strips(start_date, end_date):
    """
    Download one comic strip per day from ``start_date`` until ``end_date``.

    Each strip is stored in the current working folder as ``YYYY-MM-DD.jpg``.

    Parameters
    ----------
    start_date : datetime.datetime
        Date of the first strip to download.
    end_date : datetime.datetime
        Limit of the daily recurrence (passed to rrule as ``until``).
    """
    # Iterate the recurrence lazily: there is no need to build the full list
    # of dates (thousands of entries for the default range) before starting.
    for date in rrule.rrule(rrule.DAILY, dtstart=start_date, until=end_date):
        comic_date = '%04d-%02d-%02d' % (date.year, date.month, date.day)
        url = 'http://dilbert.com/strip/' + comic_date
        comic_name = comic_date + '.jpg'
        print('getting comic from', comic_date)
        # The strip page only embeds the image; resolve the real image URL
        # first and then save the image itself.
        ul.urlretrieve(get_true_comic_url(url), comic_name)
        # Short pause between two consecutive requests.
        time.sleep(0.01)
def get_true_comic_url(comic_url, comic_name='comic'):
    """
    Get the true comic strip url from http://dilbert.com/strip/<date>

    It looks like Scott Adams has protected himself against pointy haired
    pirates by hiding his comic strips within the assets.amuniversal domain.
    This function digs into the comic strip web-page, finds (and returns)
    the URL where the original image lives.

    Parameters
    ----------
    comic_url : str
        URL of the web-page of the comic strip.
    comic_name : str, optional
        Unused; kept so that existing callers keep working.

    Returns
    -------
    str
        First ``https://assets.amuniversal.com/...`` URL found in the page.

    Raises
    ------
    ValueError
        If the page does not contain any comic strip image URL.
    """
    response = ul.urlopen(comic_url)
    try:
        raw = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    # Python 3 returns bytes: decode them instead of searching their repr().
    # Python 2 already returns a native string, which is left untouched.
    html = raw if isinstance(raw, str) else raw.decode('utf-8', 'replace')

    # Raw string, so that the regex escapes are not parsed as string escapes.
    comic_strip_pattern = r'https://assets\.amuniversal\.com/[a-zA-Z\d]+'
    match = re.search(comic_strip_pattern, html)
    if match is None:
        raise ValueError('No comic strip image found in %s' % comic_url)
    return match.group()
# Run the download only when the file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
@LwsBtlr
Copy link

LwsBtlr commented Nov 12, 2021

line 92 should be:

comic_strip_pattern = 'https://assets\.amuniversal\.com/[a-zA-Z\d]+'

@APadierna
Copy link
Author

I'm afraid the script got a bit out of date since it was written. Thanks for pointing out the change from http to https !! :-D

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment