Skip to content

Instantly share code, notes, and snippets.

@martjanz
Last active July 16, 2020 17:21
Show Gist options
  • Save martjanz/0ddb6d184079c096f1a777de77b31be4 to your computer and use it in GitHub Desktop.
Save martjanz/0ddb6d184079c096f1a777de77b31be4 to your computer and use it in GitHub Desktop.
Heinrich - Sanguinetti photo archive downloader
"""
Heinrich - Sanguinetti Archive photo downloader
Downloads the photo archive from the Endangered Archives Programme of the British Library.
This script assumes that photos are numbered consecutively inside and between folders. There are
a few exceptions, so a manual review after the run is needed to check that all files were downloaded.
Sample image URL: http://images.eap.bl.uk/EAP755/EAP755_1_1_295/2987.jp2/full/1287,/0/default.jpg
"""
import os
import time
import urllib.error
import urllib.request
BASE_URL = 'http://images.eap.bl.uk/EAP755/EAP755_1_1_'
# Value substituted into the "{resolution}," segment of the image URL
# (presumably the requested width in pixels -- confirm against the server API).
MAX_RESOLUTION = 10000
MIN_FOLDER = 1
MAX_FOLDER = 300
INITIAL_IMAGE = 1
# Seconds to wait after each download and after reaching the end of a folder.
REQUEST_DELAY = 1
# Folder numbers to visit, MIN_FOLDER through MAX_FOLDER inclusive.
FOLDERS = range(MIN_FOLDER, MAX_FOLDER + 1)


def folder_path(folder):
    """Return the local directory where photos of *folder* are stored."""
    return 'photos/EAP755_1_1_{}'.format(folder)


def image_file_path(folder, image):
    """Return the local file path for photo number *image* of *folder*."""
    return '{path}/{image}.jpg'.format(path=folder_path(folder), image=image)


def image_url(folder, image, resolution=MAX_RESOLUTION):
    """Return the remote URL of photo number *image* inside *folder*."""
    return '{base}{folder}/{photo}.jp2/full/{resolution},/0/default.jpg'.format(
        base=BASE_URL, folder=folder, photo=image, resolution=resolution)


def fetch_image(folder, image):
    """Download one photo and return its bytes.

    Returns None when the server answers with an HTTP error, which the
    caller interprets as "this image number does not exist in the folder".
    Other failures (e.g. urllib.error.URLError on network problems) propagate.
    """
    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(image_url(folder, image)) as response:
            return response.read()
    except urllib.error.HTTPError as err:
        if err.code != 404:
            # Still treated as a missing image, but made visible so that an
            # unexpected server error is not mistaken for the end of a folder.
            print('HTTP error {} for image {} from EAP755_1_1_{}; '
                  'treating it as missing.'.format(err.code, image, folder))
        return None


def save_image(folder, image, data):
    """Write *data* to the local file for photo *image* of *folder*."""
    os.makedirs(folder_path(folder), exist_ok=True)
    target = image_file_path(folder, image)
    # Write to a temporary name first so that an interrupted write never
    # leaves a truncated .jpg that a later run would skip as already done.
    partial = target + '.part'
    with open(partial, 'wb') as out_file:
        out_file.write(data)
    os.replace(partial, target)


def download_folder(folder, image):
    """Download the photos of *folder*, starting at photo number *image*.

    Photos already present on disk are skipped. The folder is considered
    finished when two consecutive photo numbers are unavailable; a single
    missing number is treated as a gap in the numbering.

    Returns the photo number at which the next folder should start.
    """
    while True:
        if os.path.isfile(image_file_path(folder, image)):
            image += 1
            continue

        print('Checking image {} from EAP755_1_1_{}'.format(image, folder))
        data = fetch_image(folder, image)
        if data is None:
            # Check for a skipped image number. If there is, continue on this folder
            data = fetch_image(folder, image + 1)
            if data is None:
                time.sleep(REQUEST_DELAY)
                return image
            image += 1
            print('Skipped image number, continuing on current folder.')

        print('Downloading image {} from EAP755_1_1_{}'.format(image, folder))
        # The path is derived from the current image number, so a photo found
        # after a skipped number is stored under its own number.
        save_image(folder, image, data)
        image += 1
        time.sleep(REQUEST_DELAY)


def main():
    """Walk every folder in FOLDERS and download its photos."""
    # Photo numbers keep increasing from one folder to the next.
    image = INITIAL_IMAGE
    for folder in FOLDERS:
        print('Now on folder {}...'.format(folder))
        image = download_folder(folder, image)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment