APadierna/dilbert.py

## dilbert.py
#!/usr/bin/env python
"""
Simple script to download the Dilbert comic strips in a defined period of time

If no arguments are passed to the script, it will download all the Dilbert comic
strips into the current folder (it may take a while).

Acknowledgments
---------------
This script is strongly based on the work from:
https://community.spiceworks.com/scripts/show/982-download-all-dilbert-comics
"""

import datetime
import os
import re
import sys
import time
import argparse
from dateutil import rrule, parser

# for backwards compatibility
if sys.version_info[0] > 2:
    import urllib.request as ul
else:
    import urllib as ul


def main():
    """
    Script entry point.

    Parses the command line, moves into the dump folder (creating it when
    needed) and downloads the requested range of comic strips.

    If the dump folder cannot be created, a warning is written to stderr and
    the strips are downloaded into the current folder instead.
    """
    args = parse_input_arguments()

    # If a dump folder has been defined, create it (if it does not already
    # exist) and move to it
    if args.output != '.' and not os.path.isdir(args.output):
        try:
            os.makedirs(args.output)
        except OSError as err:
            # Best effort: fall back to the current folder rather than abort,
            # but tell the user instead of silently hiding the failure.
            sys.stderr.write("could not create folder '%s' (%s); "
                             "using the current folder instead\n"
                             % (args.output, err))
            args.output = '.'
    os.chdir(args.output)

    download_strips(args.start_date, args.end_date)


def parse_input_arguments():
    """
    Build the command line interface and return the parsed options.

    The returned namespace carries:

    start_date -- the ``--start`` value converted with ``parser.parse``
    end_date   -- the ``--end`` value converted with ``parser.parse``, or
                  today's date when the option is not given
    output     -- the comics dump folder (``'.'`` by default)
    """
    cli = argparse.ArgumentParser(
        description='Dilbert strips download script.')
    cli.add_argument(
        "-s", "--start",
        dest="start_date",
        default='1989-04-17',
        help="start date (1989-04-17, 1st published strip).")
    cli.add_argument(
        "-e", "--end",
        dest="end_date",
        default=None,
        help="End date (default, today)")
    cli.add_argument(
        "-o", "--output",
        dest="output",
        default='.',
        help="Comics dump folder")

    options = cli.parse_args()

    # Resolve the end date first, then the start date
    requested_end = options.end_date
    if requested_end is not None:
        options.end_date = parser.parse(requested_end)
    else:
        options.end_date = datetime.datetime.now().date()
    options.start_date = parser.parse(options.start_date)

    return options


def download_strips(start_date, end_date):
    """
    Download one comic strip per day between two dates.

    start_date -- first day to download (passed to rrule as ``dtstart``)
    end_date   -- upper bound of the period (passed to rrule as ``until``)

    Every strip is stored as ``<YYYY-MM-DD>.jpg``, relative to the current
    working folder.
    """
    # Iterate the recurrence rule lazily: there is no need to build the whole
    # list of dates before the first download starts.
    for date in rrule.rrule(rrule.DAILY, dtstart=start_date, until=end_date):
        comic_date = '%04d-%02d-%02d' % (date.year, date.month, date.day)
        url = 'http://dilbert.com/strip/' + comic_date
        comic_name = comic_date + '.jpg'
        # Print a single string: same output on Python 3, and it avoids
        # printing a tuple on Python 2, where print is a statement.
        print('getting comic from ' + comic_date)
        ul.urlretrieve(get_true_comic_url(url), comic_name)
        # Short pause between two consecutive requests
        time.sleep(0.01)


def get_true_comic_url(comic_url, comic_name='comic'):
    """
    get the true comic strip url from http://dilbert.com/strip/<date>

    It looks like Scott Adams has protected himself against pointy haired
    pirates by hiding his comic strips within the assets.amuniversal domain.
    This function digs into the comic strip web-page, finds (and returns)
    the URL where the original image lives.

    comic_url  -- address of the comic strip web-page
    comic_name -- unused; kept so existing callers keep working

    Raises ValueError when the page holds no assets.amuniversal.com URL.
    """
    response = ul.urlopen(comic_url)
    try:
        raw_page = response.read()
    finally:
        # Always release the connection, even when the read fails
        response.close()

    # Decode the payload explicitly: on Python 3, str() on bytes would give
    # the "b'...'" representation instead of the page text.
    html = raw_page.decode('utf-8', 'replace')

    # Raw string, so that '\.' and '\d' reach the regex engine untouched
    comic_strip_pattern = r'https://assets\.amuniversal\.com/[a-zA-Z\d]+'
    match = re.search(comic_strip_pattern, html)
    if match is None:
        raise ValueError('no comic strip image found in ' + comic_url)
    return match.group()


# Run the download only when the file is executed as a script, not on import
if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	"""
	Simple script to download the Dilbert comic strips in a defined period of time

	If no arguments are passed to the script, it will download all the Dilbert comic
	strips into the current folder (it may take a while).

	Acknowledgments
	---------------
	This script is strongly based on the work from:
	https://community.spiceworks.com/scripts/show/982-download-all-dilbert-comics
	"""

	import datetime
	import os
	import re
	import sys
	import time
	import argparse
	from dateutil import rrule, parser

	# for backwards compatibility
	if sys.version_info[0] > 2:
	import urllib.request as ul
	else:
	import urllib as ul


	def main():
		"""
		Script entry point.

		Parses the command line, moves into the dump folder (creating it when
		needed) and downloads the requested range of comic strips.

		If the dump folder cannot be created, a warning is written to stderr
		and the strips are downloaded into the current folder instead.
		"""
		args = parse_input_arguments()

		# If a dump folder has been defined, create it (if it does not already
		# exist) and move to it
		if args.output != '.' and not os.path.isdir(args.output):
			try:
				os.makedirs(args.output)
			except OSError as err:
				# Best effort: fall back to the current folder rather than
				# abort, but tell the user instead of hiding the failure.
				sys.stderr.write(
					"could not create folder '%s' (%s); "
					"using the current folder instead\n"
					% (args.output, err))
				args.output = '.'
		os.chdir(args.output)

		download_strips(args.start_date, args.end_date)


	def parse_input_arguments():
		"""
		Declare the command line options and return the parsed namespace.

		start_date -- the ``--start`` value converted with ``parser.parse``
		end_date   -- the ``--end`` value converted with ``parser.parse``, or
		              today's date when the option is not given
		output     -- the comics dump folder (``'.'`` by default)
		"""
		argp = argparse.ArgumentParser(
			description='Dilbert strips download script.')
		argp.add_argument(
			"-s", "--start",
			help="start date (1989-04-17, 1st published strip).",
			dest="start_date",
			default='1989-04-17')
		argp.add_argument(
			"-e", "--end",
			dest="end_date",
			help="End date (default, today)",
			default=None)
		argp.add_argument(
			"-o", "--output",
			dest="output",
			help="Comics dump folder",
			default='.')

		args = argp.parse_args()
		# No end date given: download up to today
		if args.end_date is None:
			args.end_date = datetime.datetime.now().date()
		else:
			args.end_date = parser.parse(args.end_date)
		args.start_date = parser.parse(args.start_date)

		return args


	def download_strips(start_date, end_date):
		"""
		Download one comic strip per day between two dates.

		start_date -- first day to download (passed to rrule as ``dtstart``)
		end_date   -- upper bound of the period (passed to rrule as ``until``)

		Every strip is stored as ``<YYYY-MM-DD>.jpg``, relative to the current
		working folder.
		"""
		# Iterate the recurrence rule lazily: there is no need to build the
		# whole list of dates before the first download starts.
		for date in rrule.rrule(rrule.DAILY, dtstart=start_date, until=end_date):
			comic_date = '%04d-%02d-%02d' % (date.year, date.month, date.day)
			url = 'http://dilbert.com/strip/' + comic_date
			comic_name = comic_date + '.jpg'
			# Print a single string: same output on Python 3, and it avoids
			# printing a tuple on Python 2, where print is a statement.
			print('getting comic from ' + comic_date)
			ul.urlretrieve(get_true_comic_url(url), comic_name)
			# Short pause between two consecutive requests
			time.sleep(0.01)


	def get_true_comic_url(comic_url, comic_name='comic'):
		"""
		get the true comic strip url from http://dilbert.com/strip/<date>

		It looks like Scott Adams has protected himself against pointy haired
		pirates by hiding his comic strips within the assets.amuniversal domain.
		This function digs into the comic strip web-page, finds (and returns)
		the URL where the original image lives.

		comic_url  -- address of the comic strip web-page
		comic_name -- unused; kept so existing callers keep working

		Raises ValueError when the page holds no assets.amuniversal.com URL.
		"""
		response = ul.urlopen(comic_url)
		try:
			raw_page = response.read()
		finally:
			# Always release the connection, even when the read fails
			response.close()

		# Decode the payload explicitly: on Python 3, str() on bytes would give
		# the "b'...'" representation instead of the page text.
		html = raw_page.decode('utf-8', 'replace')

		# Raw string, so that '\.' and '\d' reach the regex engine untouched
		comic_strip_pattern = r'https://assets\.amuniversal\.com/[a-zA-Z\d]+'
		match = re.search(comic_strip_pattern, html)
		if match is None:
			raise ValueError('no comic strip image found in ' + comic_url)
		return match.group()


	# Run the download only when the file is executed as a script, not on import
	if __name__ == '__main__':
		main()