Skip to content

Instantly share code, notes, and snippets.

@jakekara
Created September 4, 2019 20:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jakekara/e75a7c013ee143afbeb6548616aa2401 to your computer and use it in GitHub Desktop.
Save jakekara/e75a7c013ee143afbeb6548616aa2401 to your computer and use it in GitHub Desktop.
Download images from blakearchive
# Download copies of illuminated texts from the Blake Archive
import json
import requests
from PIL import Image
from io import BytesIO
import os
from progress.bar import Bar
def get_copy_objects(objects_url):
    """Fetch a JSON endpoint and return its decoded body.

    Args:
        objects_url: URL of the JSON resource to request.

    Returns:
        The parsed JSON payload, exactly as produced by ``Response.json()``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server stops responding.
    """
    # Without an explicit timeout, requests waits indefinitely on a
    # stalled connection.
    response = requests.get(objects_url, timeout=60)
    # Fail here with a clear HTTP error instead of later with a confusing
    # JSON decode error on an HTML error page.
    response.raise_for_status()
    return response.json()
def gen_url(img_id, dpi=300, base_url="http://www.blakearchive.org/images"):
    """Build the URL of a JPEG image.

    Args:
        img_id: Identifier of the image; becomes the file stem.
        dpi: Resolution component placed between the stem and ``.jpg``.
        base_url: Prefix under which the images are served.

    Returns:
        A string of the form ``<base_url>/<img_id>.<dpi>.jpg``.
    """
    template = "{base_url}/{img_id}.{dpi}.jpg"
    fields = {"base_url": base_url, "img_id": img_id, "dpi": dpi}
    return template.format(**fields)
def get_copy_image_urls(copy_url):
    """Return the 300-dpi image URL for every object listed at *copy_url*.

    The endpoint's JSON must contain a ``"results"`` list whose entries
    each carry a ``"dbi"`` key; that value is used as the image id.
    """
    records = get_copy_objects(copy_url)["results"]
    image_urls = []
    for record in records:
        image_urls.append(gen_url(record["dbi"], 300))
    return image_urls
def get_filename(url):
    """Return the final path component of *url* (the text after the last slash)."""
    _, tail = os.path.split(url)
    return tail
def download_images(url_array, base_path, redownload=False):
    """Download every URL in *url_array* into the directory *base_path*.

    Each file is stored under the last path component of its URL, and a
    console progress bar is advanced once per URL.

    Args:
        url_array: Sequence of image URLs to fetch.
        base_path: Existing directory that receives the files.
        redownload: When false, URLs whose destination file already exists
            are skipped; when true, they are fetched again and overwritten.

    Raises:
        requests.HTTPError: if a download answers with a 4xx/5xx status.
        requests.Timeout: if the server stops responding.
    """
    image_count = len(url_array)
    message = 'Downloading {n} images'.format(n=image_count)
    bar = Bar(message, max=image_count)
    try:
        for url in url_array:
            dst = os.path.join(base_path, get_filename(url))
            if redownload or not os.path.exists(dst):
                # Fetch before opening the destination: a failed request must
                # not leave an empty file behind that later runs would skip
                # as "already downloaded".
                response = requests.get(url, timeout=60)
                # Never save an HTTP error page under a .jpg name.
                response.raise_for_status()
                with open(dst, "wb") as image_file:
                    image_file.write(response.content)
            # Skipped files count as progress too, so the bar reaches its max.
            bar.next()
    finally:
        # Restore the terminal state even if a download raised.
        bar.finish()
def download_images_from_object_index(url, base_path="./", redownload=False):
    """Download all images referenced by the object index at *url*.

    Args:
        url: JSON endpoint listing the objects of one copy.
        base_path: Directory that receives the image files.
        redownload: Passed through to ``download_images``; when false,
            files that already exist are not fetched again.
    """
    download_images(
        get_copy_image_urls(url),
        base_path=base_path,
        redownload=redownload,
    )
def get_list_of_copies(url):
    """Return the ``effective_copy_id`` of every entry listed at *url*.

    Args:
        url: JSON endpoint whose payload has a ``"results"`` list.

    Returns:
        A list with one ``effective_copy_id`` value per result entry.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server stops responding.
    """
    # Bound the wait and reject error responses before decoding, so failures
    # show up as HTTP errors rather than JSON/KeyError noise.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    return [entry["effective_copy_id"] for entry in response.json()["results"]]
def main():
    """Download the images of every copy of the ``songsie`` work.

    Each copy's images are written to ``./img/<copy_id>``, which is
    created if it does not exist yet.
    """
    copies_index = "http://www.blakearchive.org/api/work/songsie/copies"
    objects_template = "http://www.blakearchive.org/api/copy/{copy_id}/objects"
    folder_template = "./img/{copy_id}"

    for cid in get_list_of_copies(copies_index):
        target_dir = folder_template.format(copy_id=cid)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        announcement = "Downloading {copy_id} to {dst_folder}".format(
            copy_id=cid,
            dst_folder=target_dir,
        )
        print(announcement)
        download_images_from_object_index(
            objects_template.format(copy_id=cid),
            base_path=target_dir,
        )


# Run the download only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment