-
-
Save aebadirad/92bea61f06900042bbf8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# Import all the things (we need) | |
from datetime import timedelta | |
from datetime import date | |
from requests import Session | |
from wand.image import Image | |
import time | |
import os.path | |
# One shared HTTP session, reused for every page and image request below
r = Session()
# Define all the functions (again, those we need) | |
def get_single_strip(strip_date):
    """Download the Dilbert strip for ``strip_date`` and save it as a PNG.

    The image is written to ``<YYYY>/Dilbert-YYYY-MM-DD.png`` relative to the
    current working directory; the year directory is created on demand.

    Args:
        strip_date: Object exposing ``year``, ``month`` and ``day`` (for
            example a ``datetime.date``) identifying the strip to fetch.

    Returns:
        int: ``2`` if the target file already exists (nothing is downloaded),
        ``1`` if the strip was downloaded and saved, ``0`` on failure (the
        page or image request was not OK, or no image URL was found).
    """
    year_dir = str(strip_date.year)
    date_stamp = "%04d-%02d-%02d" % (strip_date.year, strip_date.month, strip_date.day)
    save_as = os.path.join(year_dir, "Dilbert-%s.png" % date_stamp)

    # Skip the network round-trips entirely if we already have this strip.
    if os.path.isfile(save_as):
        print("%s - Already Have" % save_as)
        return 2

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(year_dir, exist_ok=True)

    # The page holding the strip lives at http://dilbert.com/fast/YYYY-MM-DD.
    source_page = r.get("http://dilbert.com/fast/%s" % date_stamp)
    if not source_page.ok:
        return 0

    # The image is served from the asset host under an opaque name. Locate
    # the URL in the page and read up to the first character that cannot be
    # part of it, rather than assuming a fixed length for the asset name.
    page_text = source_page.text
    url_prefix = "http://assets.amuniversal.com/"
    start_pos = page_text.find(url_prefix)
    if start_pos == -1:
        return 0
    name_start = start_pos + len(url_prefix)
    end_pos = name_start
    while end_pos < len(page_text) and page_text[end_pos] not in "\"'<> \t\r\n":
        end_pos += 1
    if end_pos == name_start:
        # Bare host with no asset name: nothing to download.
        return 0
    strip_url = page_text[start_pos:end_pos]

    # Fetch the image into memory only; it is written to disk after the
    # conversion to PNG.
    gif_image = r.get(strip_url)
    if not gif_image.ok:
        return 0

    # Convert the in-memory GIF to PNG and write it out in one step.
    with Image(blob=gif_image.content, format="gif") as img:
        img.format = "png"
        img.save(filename=save_as)

    print("%s - Downloaded" % save_as)
    return 1
def sync_strips():
    """Download every Dilbert strip from the first one up to today.

    Walks each calendar day from 1989-04-17 (the first strip) through
    ``date.today()`` inclusive and calls ``get_single_strip`` for it. Every
    date for which that call returns a falsy value is written, one date per
    line, to ``failed.txt`` in the current working directory. The file is
    opened in ``"w"`` mode, so it is truncated at the start of each run.
    """
    start_date = date(1989, 4, 17)
    end_date = date.today()
    total_days = (end_date - start_date).days

    # The context manager guarantees the log is closed even if a download
    # raises part-way through the run.
    with open("failed.txt", "w") as failed_files_log:
        for day_offset in range(total_days + 1):
            single_date = start_date + timedelta(day_offset)
            ret = get_single_strip(single_date)
            if not ret:
                failed_files_log.write("%s\n" % single_date)
                # Flush per failure so the log is current if the run is killed.
                failed_files_log.flush()
            elif ret == 1:
                # Pause after each real download so the server does not
                # block us; already-present strips (ret == 2) need no delay.
                time.sleep(0.75)
# Script entry point: start the full archive sync only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    sync_strips()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment