Skip to content

Instantly share code, notes, and snippets.

@mahir256
Last active July 4, 2021 04:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mahir256/26a33c833034995996b0513839c36129 to your computer and use it in GitHub Desktop.
Save mahir256/26a33c833034995996b0513839c36129 to your computer and use it in GitHub Desktop.
EAP download script
# Usage:
# For a range of books in a particular EAP collection,
# such as EAP262-1-1-1 to EAP262-1-1-7072,
# run
# python3 get_british.py 262 1 1 1 7072
#
# What matters is that the last two arguments are the start and end indices to download:
# the numbers preceding them are simply joined to form each EAP identifier.
# For a range like EAP341-1-408 to EAP341-1-409, you could therefore run
# python3 get_british.py 341 1 408 409
# or to download only EAP127-6-3, you can run
# python3 get_british.py 127 6 3 3
import io
import socket
import os
import requests
import tqdm
import img2pdf
from time import sleep
from queue import Queue
from PIL import Image
from sys import argv, exit
from threading import Thread
def download_tile():
    """Worker loop: fetch queued tiles and paste them into the current page.

    Runs forever, so it is meant to be the target of a daemon thread. Each
    item taken from the module-level queue ``q`` is a tuple
    ``(pagenum, image_width, image_height, i, j, image_url)``. The tile at
    ``image_url`` is downloaded and pasted at offset ``(i, j)`` into the
    module-level image ``new_im``.

    A failed download or decode is printed and retried after a 3 second
    pause, without limit. ``q.task_done()`` is always called exactly once
    per item so that a ``q.join()`` in the producer cannot block forever.
    """
    while True:
        pagenum, image_width, image_height, i, j, image_url = q.get()
        try:
            while True:
                try:
                    # A timeout keeps a stalled connection from hanging the
                    # worker (and therefore q.join()) indefinitely.
                    image_request = requests.get(image_url, timeout=60)
                    image_request.raise_for_status()
                    im = Image.open(io.BytesIO(image_request.content))
                    # Image.open is lazy; force the decode here so that a
                    # truncated or corrupt tile is retried instead of
                    # failing later inside paste().
                    im.load()
                    break
                except Exception as e:
                    print(pagenum,image_width,image_height,i,j,image_url,e)
                    sleep(3)
            new_im.paste(im,(i,j))
        except Exception as e:
            # Report and keep the worker alive; the tile is left unpainted.
            print(pagenum,image_width,image_height,i,j,image_url,e)
        finally:
            # Must run even on failure, otherwise q.join() never returns.
            q.task_done()
# Command line: COLLECTION [INDEX ...] START END (see the usage notes at the
# top of the file). At least the collection number and the two range bounds
# are required.
usage = "usage: python3 get_british.py COLLECTION [INDEX ...] START END"
if len(argv) < 4:
    exit(usage)
try:
    first_index = int(argv[-2])
    last_index = int(argv[-1])
except ValueError:
    exit(usage)

# Number of worker threads downloading tiles in parallel.
concurrent_threads = 8
# Bounded queue of pending tiles; q.put() blocks while it is full.
q = Queue(3*concurrent_threads)
for _ in range(concurrent_threads):
    t = Thread(target=download_tile)
    # Daemon threads do not keep the process alive once the main loop ends.
    t.daemon = True
    t.start()

collectionid = "EAP"+argv[1]
for k in range(first_index, last_index+1):
    # Identifier is the collection id followed by the fixed indices and k.
    eapindices = argv[2:-2] + [str(k)]
    bliddash = collectionid+"-"+"-".join(eapindices)
    try:
        r = requests.get(f"https://eap.bl.uk/archive-file/{bliddash}/manifest", timeout=60)
        r.raise_for_status()
        rjson = r.json()
    except (requests.RequestException, ValueError) as e:
        # One unavailable or malformed manifest should not abort the range.
        print("Could not fetch manifest for", bliddash, e)
        continue
    if not rjson["sequences"]:
        print("No images available for", rjson["metadata"][0]["value"], rjson["metadata"][1]["value"])
        continue

    # Page images are staged in a directory named after the index; explicit
    # paths are used instead of chdir so a failure cannot strand the process
    # in the wrong working directory.
    workdir = str(k)
    os.makedirs(workdir, exist_ok=True)
    canvases = rjson["sequences"][0]["canvases"]
    print("Starting", rjson["metadata"][0]["value"], rjson["metadata"][1]["value"])
    tq = tqdm.tqdm(range(1,len(canvases)+1))
    out_images = []
    for pagenum in tq:
        image_name = f'image_{pagenum}_.jpg'
        out_image = os.path.join(workdir, image_name)
        if os.path.isfile(out_image):
            # Already downloaded by an earlier run: skip the download but
            # still include the page in the PDF (and in the cleanup).
            out_images.append(out_image)
            continue
        cur_rjson = canvases[pagenum-1]
        service = cur_rjson['images'][0]['resource']['service']
        toplevelimg = service["@id"]
        image_width = cur_rjson['width']
        image_height = cur_rjson['height']
        # Use the widest advertised tile; tiles are requested as squares.
        tile_width = max(service['tiles'], key=lambda x: x['width'])['width']
        # Module-level on purpose: the download_tile workers paste into it.
        new_im = Image.new('RGB',(image_width,image_height))
        try:
            for i in range(0,image_width,tile_width):
                for j in range(0,image_height,tile_width):
                    # Edge tiles are clipped to the remaining image area.
                    cur_width = min(tile_width,image_width-i)
                    cur_height = min(tile_width,image_height-j)
                    # Looks like an IIIF Image API request:
                    # region / size / rotation / quality.format
                    tile_url = f"{toplevelimg}/{i},{j},{tile_width},{tile_width}/{cur_width},{cur_height}/0/default.jpg"
                    q.put((pagenum,image_width,image_height,i,j,tile_url))
            # Wait until every tile of this page has been pasted.
            q.join()
        except KeyboardInterrupt:
            exit(1)
        new_im.save(out_image)
        tq.set_postfix(out_image=image_name)
        out_images.append(out_image)
        sleep(3) # Adjust this at will.

    if not out_images:
        print("No pages downloaded for", bliddash)
    else:
        pdf_path = bliddash + " - " + rjson["metadata"][1]["value"].replace('/','') + ".pdf"
        try:
            # Convert before opening the output so that a conversion failure
            # does not leave an empty PDF behind.
            pdf_bytes = img2pdf.convert(out_images)
            with open(pdf_path,"wb") as f:
                f.write(pdf_bytes)
        except Exception as e:
            # Keep the page images so a later run can retry the PDF step.
            print("error when writing PDF file", e)
            continue
    try:
        for img in out_images:
            os.remove(img)
        os.rmdir(workdir)
    except OSError as e:
        print("could not clean up", workdir, e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment