Skip to content

Instantly share code, notes, and snippets.

@irv
Created January 25, 2021 10:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save irv/156f1e8fb80b8a3af57626761ed3a628 to your computer and use it in GitHub Desktop.
Save irv/156f1e8fb80b8a3af57626761ed3a628 to your computer and use it in GitHub Desktop.
Create heatmap from IIIF Image requests (incomplete)
from io import BytesIO
import re
import asyncio
import numpy as np
from aiohttp import ClientSession, ClientPayloadError
import requests
import itertools, operator
from PIL import Image
import cv2
import backoff
def create_image_matrix(width, height, regions):
    """Accumulate request regions into a per-pixel heat matrix.

    Args:
        width: Width of the source image in pixels.
        height: Height of the source image in pixels.
        regions: Iterable of dicts with integer pixel keys ``'x'``, ``'y'``,
            ``'w'``, ``'h'`` and a ``'count'`` giving how many times that
            region was requested.

    Returns:
        A float ``numpy`` array of shape ``(height, width)`` in which every
        pixel covered by a region has been incremented by ``0.1 * count``.
        Overlapping regions add up.
    """
    affected = np.zeros((height, width))
    for r in regions:
        # Clamp the region to the image so that requests overlapping (or
        # lying entirely beyond) the edge cannot produce negative sizes.
        x0 = min(max(r['x'], 0), width)
        y0 = min(max(r['y'], 0), height)
        x1 = min(max(r['x'] + r['w'], x0), width)
        y1 = min(max(r['y'] + r['h'], y0), height)
        if x1 <= x0 or y1 <= y0:
            # Nothing of this region falls inside the image.
            continue
        # Add in place on the covered slice instead of building a padded
        # full-size array for every region.
        affected[y0:y1, x0:x1] += 0.1 * r['count']
    return affected
def _pixel_region(region, is_pct, width, height):
    """Return *region* as a new dict of pixel values.

    ``pct:`` regions are percentages of the image: x and w are scaled by the
    image width, y and h by the image height. Pixel regions are copied as-is.
    """
    if not is_pct:
        return dict(region)
    return {
        'x': int((region['x'] / 100) * width),
        'y': int((region['y'] / 100) * height),
        'w': int((region['w'] / 100) * width),
        'h': int((region['h'] / 100) * height),
    }


def _normalise_to_uint8(arr):
    """Linearly rescale *arr* to 0..255 and return it as a uint8 array.

    An empty or uniform matrix has no range to stretch, so it maps to all
    zeros rather than dividing by zero.
    """
    if arr.size == 0:
        return np.zeros(arr.shape, dtype='uint8')
    low = arr.min()
    span = arr.max() - low
    if span == 0:
        return np.zeros(arr.shape, dtype='uint8')
    return np.around((arr - low) * (1 / span * 255)).astype('uint8')


async def main(loop, filename="access.log", max_requests=200):
    """Build one heatmap PNG per image from IIIF region requests in a log.

    Scans *filename* for IIIF Image API request paths with an explicit
    region, e.g.
    ``/image/iiif/ark:/81055/vdc_100049124055.0x000010/2048,0,1408,2048/704,/0/default.jpg``
    (or a ``pct:x,y,w,h`` region), fetches each referenced image's
    ``info.json`` to learn its dimensions, accumulates the requested regions
    into a heat matrix and writes ``<identifier>.png`` to the current
    directory.

    Args:
        loop: Unused; retained so existing callers keep working.
        filename: Path of the access log to scan.
        max_requests: Maximum number of matching requests to use, or
            ``None`` for no limit.
    """
    # A single number: digits with an optional fractional part. Deliberately
    # free of nested quantifiers so a malformed log line cannot trigger
    # catastrophic backtracking.
    number = r"(?:\d+\.?\d*|\.\d+)"
    region = r"(?:\d+,\d+,\d+,\d+|pct:{n},{n},{n},{n})".format(n=number)
    img_api_regex = re.compile(
        r"/.*/" + region + r"/.*/\d+/default\.(?:jpg|gif|png|tif|jp2|webm|webp)"
    )

    parsed_requests = []
    with open(filename, "r") as f:
        for line in f:
            if max_requests is not None and len(parsed_requests) >= max_requests:
                break
            matches = img_api_regex.search(line)
            if matches:
                parsed_requests.append(parse_request(matches.group(0)))

    uri_base = "https://api.bl.uk/image/iiif/ark:/81055"
    image_identifiers = {item[0] for item in parsed_requests}
    image_urls = [get_info_url(uri_base, i) for i in image_identifiers]
    image_info = await get_info_jsons(image_urls)

    # Group by identifier up front; requests for one image need not be
    # adjacent in the log.
    requests_by_image = {}
    for identifier, req_region, is_pct in parsed_requests:
        requests_by_image.setdefault(identifier, []).append((req_region, is_pct))

    # Render and write one image at a time so that only a single full-size
    # matrix is held in memory.
    for identifier, width, height in image_info:
        # Aggregate identical regions into a single entry with a count.
        counts = {}
        for req_region, is_pct in requests_by_image.get(identifier, []):
            px = _pixel_region(req_region, is_pct, width, height)
            key = (px['x'], px['y'], px['w'], px['h'])
            counts[key] = counts.get(key, 0) + 1
        bucketed_requests = [
            {'x': x, 'y': y, 'w': w, 'h': h, 'count': count}
            for (x, y, w, h), count in counts.items()
        ]
        heat = create_image_matrix(width, height, bucketed_requests)
        grey = _normalise_to_uint8(heat)
        # applyColorMap already yields a BGR image, which is the channel
        # order imwrite expects, so no colour conversion is needed.
        coloured = cv2.applyColorMap(grey, cv2.COLORMAP_JET)
        cv2.imwrite("%s.png" % identifier, coloured)
async def get_info_jsons(image_urls, max_concurrency=1000, timeout_seconds=10000):
    """Fetch every info.json URL concurrently and extract image dimensions.

    Args:
        image_urls: Iterable of info.json URLs.
        max_concurrency: Maximum number of requests in flight at once.
        timeout_seconds: Limit applied both to establishing a connection and
            to each request as a whole.

    Returns:
        A list of ``(identifier, width, height)`` tuples, where the
        identifier is the last path segment of the document's ``@id`` (or
        ``id``) value. Responses that are missing or lack these fields are
        reported on stdout and skipped, so one bad document does not abort
        the whole batch.
    """
    # Imported locally: the module-level aiohttp import does not bring this
    # name into scope.
    from aiohttp import ClientTimeout

    urls = list(image_urls)
    sem = asyncio.Semaphore(max_concurrency)
    # ClientTimeout replaces the deprecated conn_timeout/read_timeout
    # keyword arguments of ClientSession.
    timeout = ClientTimeout(total=timeout_seconds, connect=timeout_seconds)
    async with ClientSession(timeout=timeout) as session:
        # Every request shares the session and is throttled by the semaphore;
        # gather returns results in the same order as the URLs.
        responses = await asyncio.gather(
            *(bound_fetch(sem, url, session) for url in urls)
        )

    results = []
    for url, info_json in zip(urls, responses):
        if not isinstance(info_json, dict):
            print("ERROR: no info.json for {}".format(url))
            continue
        image_id = info_json.get('@id') or info_json.get('id')
        width = info_json.get('width')
        height = info_json.get('height')
        if not image_id or width is None or height is None:
            print("ERROR: incomplete info.json for {}".format(url))
            continue
        # rstrip guards against an id written with a trailing slash.
        results.append((image_id.rstrip('/').split('/')[-1], width, height))
    return results
def parse_request(path):
    """Split an IIIF image request path into identifier and region.

    The path is expected to end in
    ``.../<identifier>/<region>/<size>/<rotation>/<file>`` where ``<region>``
    is either ``x,y,w,h`` in pixels or ``pct:x,y,w,h`` in percent.

    Args:
        path: Request path (or URL) of an IIIF image request.

    Returns:
        A tuple ``(identifier, region, pct)``. ``region`` is a dict with keys
        ``'x'``, ``'y'``, ``'w'``, ``'h'``; values are ints for pixel regions
        and floats for percentage regions (which may be fractional). ``pct``
        is True when the region was given as a percentage.

    Raises:
        ValueError: If the path has no region segment, the region does not
            have exactly four components, or a component is not numeric.
    """
    parts = path.split('/')[-5:]
    if len(parts) < 2:
        raise ValueError("no region segment in path: {!r}".format(path))
    identifier, region_string = parts[0], parts[1]

    pct = region_string.startswith("pct:")
    if pct:
        region_string = region_string[len("pct:"):]

    # Percentages may carry decimals (e.g. "pct:12.5,..."), pixels may not.
    convert = float if pct else int
    values = [convert(value) for value in region_string.split(',')]
    if len(values) != 4:
        raise ValueError("region must have four values: {!r}".format(parts[1]))

    return (identifier, dict(zip(("x", "y", "w", "h"), values)), pct)
def get_info_url(uri_base, identifier):
    """Return the info.json URL for *identifier* under *uri_base*.

    A trailing slash on *uri_base* is tolerated so the result never contains
    a doubled ``//`` before the identifier.
    """
    return "/".join([uri_base.rstrip("/"), identifier, "info.json"])
async def bound_fetch(sem, url, session):
    """Fetch *url* through *session* while holding *sem*.

    The semaphore is held for the duration of the fetch, which caps how many
    fetches run at the same time. Returns whatever ``fetch`` returns.
    """
    async with sem:
        payload = await fetch(url, session)
    return payload
# Number of fetch attempts made so far (retries included); used only to
# number the progress lines printed by fetch().
COUNTER = 0


# on_predicate retries while the result is falsy (e.g. None after a failed
# body read). It needs a wait generator: full_jitter is a jitter function,
# not a generator, and is already the default jitter applied to the waits.
@backoff.on_predicate(backoff.fibo, max_value=13)
@backoff.on_exception(backoff.expo,
                      ClientPayloadError,
                      max_time=60)
async def fetch(url, session):
    """GET *url* and return its decoded JSON body.

    Prints one progress line per attempt with the running attempt number and
    the response's DATE and DELAY headers.

    Returns:
        The parsed JSON document, or None when reading the body failed with
        ClientPayloadError (the falsy result makes on_predicate retry).
    """
    global COUNTER
    async with session.get(url) as response:
        delay = response.headers.get("DELAY")
        date = response.headers.get("DATE")
        COUNTER += 1
        print("{}. {}:{} with delay {}".format(str(COUNTER), date, response.url, delay))
        try:
            # content_type=None disables aiohttp's strict application/json
            # check; info.json may be served as application/ld+json.
            return await response.json(content_type=None)
        except ClientPayloadError:
            print("ERROR: {}".format(url))
            return None
if __name__ == '__main__':
    # Create the loop explicitly: asyncio.get_event_loop() is deprecated when
    # no loop is running. Close it afterwards so its resources are released.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main(loop))
    finally:
        loop.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment