Skip to content

Instantly share code, notes, and snippets.

@mahir256
Created June 1, 2021 17:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mahir256/95fcdcb8a38ba9b6c586bc41bf43c8a1 to your computer and use it in GitHub Desktop.
Save mahir256/95fcdcb8a38ba9b6c586bc41bf43c8a1 to your computer and use it in GitHub Desktop.
TCIP download script
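# Usage:
#   python3 get_tcip.py ARK_ID
# where ARK_ID is the identifier of a particular TCIP book, taken from its
# viewer URL. For example, for the book whose viewer URL begins
# http://access.bl.uk/item/viewer/ark:/81055/vdc_100086389865.0x000001
# run
#   python3 get_tcip.py vdc_100086389865
# Each page is saved as image_N_.jpg inside a directory named after ARK_ID.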
import io
import requests
import socket
import os
import tqdm
from time import sleep
from queue import Queue
from PIL import Image
from sys import argv, exit
from threading import Thread
def download_tile(*, timeout=60, retry_delay=3):
    """Worker loop: fetch queued tiles and paste them into the current page.

    Runs forever, so it is meant to be the target of a daemon thread. Each
    item taken from the module-level queue ``q`` is a tuple
    ``(pagenum, image_width, image_height, i, j, image_url)``. The tile at
    ``image_url`` is downloaded and pasted into the module-level image
    ``new_im`` with its top-left corner at pixel ``(i, j)``; the remaining
    fields are only used in the diagnostic line printed on failure.

    Any failure (network error, timeout, HTTP error status, undecodable or
    truncated image data) is printed and the tile is retried after a pause,
    indefinitely, until it succeeds.

    Args:
        timeout: Seconds to wait on the HTTP request before treating it as
            failed and retrying.
        retry_delay: Seconds to sleep between attempts.
    """
    while True:
        pagenum, image_width, image_height, i, j, image_url = q.get()
        while True:
            try:
                # Without a timeout a stalled connection would block this
                # worker forever and q.join() in the main thread would never
                # return.
                image_request = requests.get(image_url, timeout=timeout)
                # Report HTTP errors as such instead of as an image decoding
                # failure on the error page's body.
                image_request.raise_for_status()
                im = Image.open(io.BytesIO(image_request.content))
                # Image.open is lazy: pixel data is only decoded when it is
                # first needed, i.e. during paste. Keeping the paste inside
                # the retried section means a truncated download is fetched
                # again instead of killing the worker before task_done().
                new_im.paste(im, (i, j))
                break
            except Exception as e:
                # Deliberately broad: this is the worker's top-level boundary
                # and every failure is handled the same way (report, retry).
                print(pagenum,image_width,image_height,i,j,image_url,e)
                sleep(retry_delay)
        q.task_done()
# Number of worker threads downloading tiles in parallel.
concurrent_threads = 8
# Bounded work queue shared with the workers; put() blocks once it holds
# three pending tiles per worker.
q = Queue(maxsize=3 * concurrent_threads)
# Start the workers as daemon threads so that they do not keep the process
# alive once the main script finishes or exits.
for _ in range(concurrent_threads):
    worker = Thread(target=download_tile, daemon=True)
    worker.start()
# Main script: download every page of the item named on the command line.
if len(argv) < 2:
    # sys.exit with a string prints it to stderr and exits with status 1,
    # which is friendlier than an IndexError traceback.
    exit("Usage: python3 get_tcip.py ARK_ID  (for example vdc_100086389865)")
ark_id = argv[1]
# Explicit timeout so a stalled connection cannot hang the script, and an
# explicit status check so an HTTP error is reported as such rather than as
# a JSON decoding error or KeyError further down.
r = requests.get(
    f"https://api.bl.uk/metadata/iiif/ark:/81055/{ark_id}.0x000001/manifest.json",
    timeout=60,
)
r.raise_for_status()
rjson = r.json()
if not rjson["sequences"]:
    print("No images available for", rjson["metadata"][0]["value"], rjson["metadata"][1]["value"])
    exit(0)

# Pages are written into a directory named after the item. An existing
# directory is reused, so an interrupted run can simply be started again.
os.makedirs(ark_id, exist_ok=True)
os.chdir(ark_id)
canvases = rjson["sequences"][0]["canvases"]
print("Starting", rjson["metadata"][0]["value"], rjson["metadata"][1]["value"])
tq = tqdm.tqdm(canvases)
# Page numbers are 1-based and appear in the output file names.
for pagenum, cur_rjson in enumerate(tq, start=1):
    out_image = f'image_{pagenum}_.jpg'
    # Skip pages that were already saved by a previous run.
    if os.path.isfile(out_image):
        continue
    service = cur_rjson['images'][0]['resource']['service']
    toplevelimg = service["@id"]
    image_width = cur_rjson['width']
    image_height = cur_rjson['height']
    # Use the widest tile size the image service advertises; tiles are
    # requested as squares of this size.
    tile_width = max(tile['width'] for tile in service['tiles'])
    # Must remain a module-level name: the download_tile workers paste
    # every tile of the current page into this image.
    new_im = Image.new('RGB', (image_width, image_height))
    try:
        for i in range(0, image_width, tile_width):
            for j in range(0, image_height, tile_width):
                # Tiles on the right and bottom edges are clipped to the
                # image bounds.
                cur_width = min(tile_width, image_width - i)
                cur_height = min(tile_width, image_height - j)
                # URL layout: region (x,y,w,h) / size (w,h) / rotation /
                # quality.format
                tile_url = f"{toplevelimg}/{i},{j},{tile_width},{tile_width}/{cur_width},{cur_height}/0/default.jpg"
                q.put((pagenum, image_width, image_height, i, j, tile_url))
        # Wait until the workers have pasted every tile of this page.
        q.join()
    except KeyboardInterrupt:
        # Leave without saving a partially assembled page.
        exit(1)
    new_im.save(out_image)
    tq.set_postfix(out_image=out_image)
    sleep(3) # Adjust this at will.
os.chdir("..")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment