-
-
Save mahir256/95fcdcb8a38ba9b6c586bc41bf43c8a1 to your computer and use it in GitHub Desktop.
TCIP download script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage:
# For a particular TCIP book
# (such as the one whose viewer URL begins http://access.bl.uk/item/viewer/ark:/81055/vdc_100086389865.0x000001)
# run
#     python3 get_tcip.py vdc_100086389865
import io | |
import requests | |
import socket | |
import os | |
import tqdm | |
from time import sleep | |
from queue import Queue | |
from PIL import Image | |
from sys import argv, exit | |
from threading import Thread | |
def download_tile():
    """Worker loop: fetch queued tiles and paste them into the current page image.

    Intended to run forever in a daemon thread. Each item taken from the
    module-level queue ``q`` is a tuple
    ``(pagenum, image_width, image_height, i, j, image_url)``; the tile fetched
    from ``image_url`` is pasted into the module-level image ``new_im`` with
    its top-left corner at ``(i, j)``. The remaining tuple fields are only
    used in the diagnostic line printed when a fetch fails.

    A failed fetch or decode is printed and retried every 3 seconds until it
    succeeds. ``q.task_done()`` is always called once per item, so a failure
    while pasting cannot leave the producer blocked in ``q.join()``.
    """
    while True:
        pagenum, image_width, image_height, i, j, image_url = q.get()
        try:
            while True:
                try:
                    # A timeout keeps a stalled connection from hanging this
                    # worker (and therefore q.join()) indefinitely.
                    image_request = requests.get(image_url, timeout=30)
                    # Turn HTTP error responses into a retry with a clear message.
                    image_request.raise_for_status()
                    im = Image.open(io.BytesIO(image_request.content))
                    # Image.open is lazy; force the decode here so a truncated
                    # or corrupt tile is retried instead of failing in paste().
                    im.load()
                    break
                except Exception as e:
                    # Deliberately broad: any fetch/decode problem is treated
                    # as transient and retried after a short pause.
                    print(pagenum, image_width, image_height, i, j, image_url, e)
                    sleep(3)
            new_im.paste(im, (i, j))
        finally:
            q.task_done()
# Worker pool: a bounded queue (three slots per worker) feeds a fixed number
# of daemon threads, each running download_tile.
concurrent_threads = 8
q = Queue(maxsize=3 * concurrent_threads)
for _ in range(concurrent_threads):
    worker = Thread(target=download_tile, daemon=True)
    worker.start()
# --- Script body: download every page of the book named on the command line ---
# Each page is assembled from tiles that the worker threads fetch and paste
# into the module-level image `new_im`, then saved as image_<pagenum>_.jpg in
# a directory named after the ark id.
if len(argv) < 2:
    print(f"Usage: python3 {argv[0]} <ark_id>   (e.g. vdc_100086389865)")
    exit(2)
ark_id = argv[1]
r = requests.get(
    f"https://api.bl.uk/metadata/iiif/ark:/81055/{ark_id}.0x000001/manifest.json",
    timeout=30,
)
# Fail with a clear HTTP error instead of a JSON decoding error on a bad id.
r.raise_for_status()
rjson = r.json()
if not rjson.get("sequences"):
    print("No images available for", rjson["metadata"][0]["value"], rjson["metadata"][1]["value"])
    exit(0)
else:
    os.makedirs(ark_id, exist_ok=True)
    os.chdir(ark_id)
    canvases = rjson["sequences"][0]["canvases"]
    pagenums = len(canvases)
    print("Starting", rjson["metadata"][0]["value"], rjson["metadata"][1]["value"])
    tq = tqdm.tqdm(range(1, pagenums + 1))
    for pagenum in tq:
        out_image = f'image_{pagenum}_.jpg'
        # Pages already saved by an earlier run are skipped, so the script
        # can be re-run to resume an interrupted download.
        if os.path.isfile(out_image):
            continue
        cur_rjson = canvases[pagenum - 1]
        service = cur_rjson['images'][0]['resource']['service']
        toplevelimg = service["@id"]
        image_width = cur_rjson['width']
        image_height = cur_rjson['height']
        # Use the widest advertised tile to minimise the number of requests.
        tile = max(service['tiles'], key=lambda x: x['width'])
        tile_width = tile['width']
        # A tile's height may differ from its width; when the entry has no
        # 'height', fall back to the width (square tiles).
        tile_height = tile.get('height', tile_width)
        new_im = Image.new('RGB', (image_width, image_height))
        try:
            for i in range(0, image_width, tile_width):
                for j in range(0, image_height, tile_height):
                    # Edge tiles are requested at their clipped pixel size.
                    cur_width = min(tile_width, image_width - i)
                    cur_height = min(tile_height, image_height - j)
                    tile_url = f"{toplevelimg}/{i},{j},{tile_width},{tile_height}/{cur_width},{cur_height}/0/default.jpg"
                    q.put((pagenum, image_width, image_height, i, j, tile_url))
            # Wait until the workers have pasted every tile of this page.
            q.join()
        except KeyboardInterrupt:
            exit(1)
        # Write to a temporary name and rename afterwards, so an interrupted
        # save never leaves a partial file that a later run would skip.
        partial_image = out_image + ".part"
        new_im.save(partial_image, format="JPEG")
        os.replace(partial_image, out_image)
        tq.set_postfix(out_image=out_image)
        sleep(3)  # Adjust this at will.
    os.chdir("..")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment