Skip to content

Instantly share code, notes, and snippets.

@andylshort
Created March 10, 2018 19:57
Show Gist options
  • Save andylshort/2fa21a204bd8070d84c5f33e80882cca to your computer and use it in GitHub Desktop.
Save andylshort/2fa21a204bd8070d84c5f33e80882cca to your computer and use it in GitHub Desktop.
Small scripts to scrape the beautiful artwork of Simon Stålenhag from his website
import os
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
# Site that hosts the gallery; image URLs are built relative to it.
url = "http://www.simonstalenhag.se/"
# Captures the base name (without extension) of every full-size image link,
# e.g. href="bilderbig/some_title_1920.jpg" -> "some_title_1920".
image_regex = r'href="bilderbig/([^\.]*_\d\d\d\d)\.jpg"'


def find_image_names(html):
    """Return the sorted, de-duplicated image base names linked from *html*.

    *html* is the decoded text of the gallery page. An empty list is returned
    when no link matches ``image_regex``.
    """
    return sorted(set(re.findall(image_regex, html)))


def build_image_url(name):
    """Return the URL of the full-size image called *name*.

    The name is percent-quoted so characters that are not valid in a URL
    (spaces, non-ASCII letters) do not make ``urlopen`` raise. ``%`` is kept
    as-is so names that are already percent-encoded are not encoded twice.
    """
    return url + "bilderbig/" + urllib.parse.quote(name, safe="%") + ".jpg"


def build_destination(dest_folder, name):
    """Return the local path where image *name* is stored in *dest_folder*.

    Only the final path component of *name* is used, so a scraped name can
    never place the file outside *dest_folder*.
    """
    return os.path.join(dest_folder, os.path.basename(name) + ".jpg")


def fetch_page(page_url):
    """Download *page_url* and return its body decoded to text."""
    with urllib.request.urlopen(page_url) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        raw = response.read()
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server announced a charset Python does not know.
        return raw.decode("utf-8", errors="replace")


def download_image(image_url, destination):
    """Download *image_url* and write its bytes to *destination*.

    The body is read completely before the output file is opened, so a
    failed download does not leave an empty file behind.
    """
    with urllib.request.urlopen(image_url) as response:
        data = response.read()
    with open(destination, "wb") as out_file:
        out_file.write(data)


def main(argv):
    """Scrape the gallery page and download every full-size image.

    *argv* is the full argument vector (program name first); exactly one
    extra argument, the destination directory, is expected.

    Returns the process exit status: ``-1`` for a usage error, ``0``
    otherwise. A failed individual download is reported and skipped.
    """
    if len(argv) != 2:
        print("Please specify destination directory as an argument")
        return -1

    dest_folder = argv[1]
    if not os.path.isdir(dest_folder):
        print("Destination is not a folder")
        return -1

    # Download the webpage and scrape it for image names
    names = find_image_names(fetch_page(url))

    # Download all large versions of images
    for name in names:
        image_url = build_image_url(name)
        print(image_url)
        try:
            download_image(image_url, build_destination(dest_folder, name))
        except urllib.error.URLError:
            # URLError also covers HTTPError; one bad image must not abort
            # the remaining downloads.
            print("Could not download " + image_url)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment