-
-
Save mahir256/26a33c833034995996b0513839c36129 to your computer and use it in GitHub Desktop.
EAP download script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage:
# For a range of books in a particular EAP collection,
# such as EAP262-1-1-1 to EAP262-1-1-7072,
# run
# python3 get_british.py 262 1 1 1 7072
#
# What matters is that the last two arguments are the start and end indices to download:
# the numbers preceding them will simply be joined in forming each EAP identifier.
# For a range like EAP341-1-408 to EAP341-1-409, you could therefore run
# python3 get_british.py 341 1 408 409
# or to download only EAP127-6-3, you can run
# python3 get_british.py 127 6 3 3
import io | |
import socket | |
import os | |
import requests | |
import tqdm | |
import img2pdf | |
from time import sleep | |
from queue import Queue | |
from PIL import Image | |
from sys import argv, exit | |
from threading import Thread | |
def download_tile():
    """Worker loop: fetch queued tiles and paste them into the current page image.

    Runs forever and is meant to be the target of a daemon thread. Each item
    taken from the module-level queue ``q`` is a tuple
    ``(pagenum, image_width, image_height, i, j, image_url)``, where ``(i, j)``
    is the position at which the tile is pasted into the module-level image
    ``new_im``. ``pagenum``, ``image_width`` and ``image_height`` are only used
    in the error message.

    A tile that cannot be fetched or decoded is retried every 3 seconds until
    it succeeds; ``q.task_done()`` is called once the tile has been pasted.
    """
    while True:
        pagenum, image_width, image_height, i, j, image_url = q.get()
        while True:
            try:
                # Without a timeout a stalled connection would block this
                # worker forever, and q.join() in the main loop with it.
                image_request = requests.get(image_url, timeout=60)
                image_request.raise_for_status()
                im = Image.open(io.BytesIO(image_request.content))
                # Image.open is lazy: force the decode here so a truncated or
                # corrupt tile is retried instead of raising inside paste(),
                # which would kill the thread before task_done() is reached.
                im.load()
                break
            except Exception as e:
                print(pagenum, image_width, image_height, i, j, image_url, e)
                sleep(3)
        new_im.paste(im, (i, j))
        q.task_done()
# Number of tile-download worker threads running in parallel.
concurrent_threads = 8
# Bounded work queue shared between the main loop (producer) and the workers.
q = Queue(maxsize=3 * concurrent_threads)
# Start the workers as daemon threads so they never keep the process alive.
for _ in range(concurrent_threads):
    worker = Thread(target=download_tile, daemon=True)
    worker.start()
# Main loop: for every index in the requested range, fetch the item's IIIF
# manifest, rebuild each page from its tiles and bundle the pages into one PDF.
if len(argv) < 4:
    print("Usage: python3 get_british.py <EAP number> [<index> ...] <start> <end>")
    exit(1)

# These parts of the identifier are the same for every item in the range.
eapnum = argv[1]
collectionid = "EAP" + eapnum
eapprefix = argv[2:-2]

for k in range(int(argv[-2]), int(argv[-1]) + 1):
    eapindices = eapprefix + [str(k)]
    bliddash = collectionid + "-" + "-".join(eapindices)
    try:
        r = requests.get(f"https://eap.bl.uk/archive-file/{bliddash}/manifest", timeout=60)
        r.raise_for_status()
        rjson = r.json()
    except (requests.RequestException, ValueError) as e:
        # One unreachable or malformed manifest should not abort the whole range.
        print("Could not fetch manifest for", bliddash, e)
        continue
    if not rjson["sequences"]:
        print("No images available for", rjson["metadata"][0]["value"], rjson["metadata"][1]["value"])
        continue

    # Page images are staged in a per-item directory named after the index;
    # explicit paths are used so the working directory is never changed.
    workdir = str(k)
    os.makedirs(workdir, exist_ok=True)
    canvases = rjson["sequences"][0]["canvases"]
    print("Starting", rjson["metadata"][0]["value"], rjson["metadata"][1]["value"])
    tq = tqdm.tqdm(enumerate(canvases, 1), total=len(canvases))
    out_images = []
    for pagenum, cur_rjson in tq:
        out_image = os.path.join(workdir, f'image_{pagenum}_.jpg')
        if os.path.isfile(out_image):
            # Left over from an earlier, interrupted run: it must still be
            # listed, otherwise the page is missing from the PDF and the file
            # is never cleaned up.
            out_images.append(out_image)
            continue
        service = cur_rjson['images'][0]['resource']['service']
        toplevelimg = service["@id"]
        image_width = cur_rjson['width']
        image_height = cur_rjson['height']
        # Use the widest advertised tile; tiles are requested as squares of that size.
        tile_width = max(service['tiles'], key=lambda x: x['width'])['width']
        # download_tile() pastes into this module-level image.
        new_im = Image.new('RGB', (image_width, image_height))
        try:
            for i in range(0, image_width, tile_width):
                for j in range(0, image_height, tile_width):
                    # Edge tiles are smaller than a full tile.
                    cur_width = min(tile_width, image_width - i)
                    cur_height = min(tile_width, image_height - j)
                    tile_url = f"{toplevelimg}/{i},{j},{tile_width},{tile_width}/{cur_width},{cur_height}/0/default.jpg"
                    q.put((pagenum, image_width, image_height, i, j, tile_url))
            # Wait until every tile of this page has been pasted.
            q.join()
        except KeyboardInterrupt:
            exit(1)
        # Save under a temporary name and rename, so an interrupted save is
        # never mistaken for a finished page on the next run.
        partial_image = out_image + ".part"
        new_im.save(partial_image, format="JPEG")
        os.replace(partial_image, out_image)
        tq.set_postfix(out_image=out_image)
        out_images.append(out_image)
        sleep(3)  # Adjust this at will.

    pdf_name = bliddash + " - " + rjson["metadata"][1]["value"].replace('/', '') + ".pdf"
    try:
        # Convert before opening the output so a failed conversion does not
        # leave an empty PDF behind.
        pdf_bytes = img2pdf.convert(out_images)
        with open(pdf_name, "wb") as f:
            f.write(pdf_bytes)
    except Exception as e:
        print("error when writing PDF file", e)
        # Keep the page images so a later run can retry the PDF step.
        continue
    for img in out_images:
        os.remove(img)
    try:
        os.rmdir(workdir)
    except OSError as e:
        print("could not remove directory", workdir, e)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment