Skip to content

Instantly share code, notes, and snippets.

Created August 17, 2021 06:32
Show Gist options
  • Save juinquok/f1a0c18947d54d1590391d1af81bbc8e to your computer and use it in GitHub Desktop.
Save juinquok/f1a0c18947d54d1590391d1af81bbc8e to your computer and use it in GitHub Desktop.
Convert Ground Truth Manifest to Pascal VOC XML (Lambda Function)
# This Lambda function is triggered by EventBridge (CloudWatch Events) when a SageMaker Ground Truth labeling job completes,
# i.e. on a "SageMaker Ground Truth Labeling Job State Change" event where LabelingJobStatus == Completed.
# This Lambda should then trigger the next Lambda function in the sequence.
import json
import string
import glob
from pathlib import Path
import boto3
# Module-level string.Template objects used to render the XML output.
# NOTE(review): both template literals are cut off in this copy of the file -- the
# template bodies and their closing triple quotes are missing and must be restored
# from the original source before this module can be imported.
# full_template is substituted with: image_filename, image_width, image_height,
# bounding_box_objects.
# bndbox_template is substituted with: label, xmin, ymin, xmax, ymax.
full_template = string.Template("""
bndbox_template = string.Template("""<object>
class BoundingBoxXml:
    """One labelled bounding box, convertible to an XML ``<object>`` fragment.

    The box is supplied as a top-left corner plus a size and is stored as
    corner coordinates (``xmin``, ``ymin``, ``xmax``, ``ymax``).
    """

    def __init__(self, label: str, top: int, left: int, height: int, width: int):
        """Store the label and convert (top, left, height, width) to corners.

        Args:
            label: Class label of the annotated object.
            top: Y coordinate of the box's top edge.
            left: X coordinate of the box's left edge.
            height: Box height; added to ``top`` to obtain ``ymax``.
            width: Box width; added to ``left`` to obtain ``xmax``.
        """
        self.label = label
        self.xmin = left
        self.ymin = top
        self.xmax = left + width
        self.ymax = top + height
        # Stays empty until generate_bndbox_object() has been called.
        self.xml = ""

    def generate_bndbox_object(self) -> None:
        """Render this box through the module-level ``bndbox_template``.

        The result is stored in ``self.xml``; nothing is returned. The
        template is given the values ``label``, ``xmin``, ``ymin``, ``xmax``
        and ``ymax``.

        Raises:
            KeyError: If the template contains a placeholder other than the
                five values listed above.
        """
        self.xml = bndbox_template.substitute(
            label=self.label,
            xmin=self.xmin,
            ymin=self.ymin,
            xmax=self.xmax,
            ymax=self.ymax,
        )
# Lambda entry point: reads Ground Truth annotation JSON objects from the source
# bucket, renders one XML document per annotated image via full_template, and then
# processes the annotation folder for transfer to the output bucket.
# NOTE(review): this copy of the function has lost its indentation and several
# statements are cut off (unclosed calls, a dangling line continuation, an empty
# assignment). The notes below mark each gap; restore the missing text from the
# original source rather than guessing.
# `event` and `context` are not used anywhere in the visible body.
def lambda_handler(event, context):
# TODO implement
# Placeholders: replace both with real bucket names (string literals) before deploying.
source_bucket_name = <<INSERT YOUR SOURCE BUCKET HERE>>
output_bucket_name = <<INSERT YOUR OUTPUT BUCKET HERE>>
s3_client = boto3.client("s3")
s3_resource = boto3.resource("s3")
bucket = s3_resource.Bucket(source_bucket_name)
storage_annotations_bucket = s3_resource.Bucket(output_bucket_name)
# NOTE(review): `bucket_name` is not defined in the visible code; only
# source_bucket_name / output_bucket_name are -- presumably source_bucket_name is meant.
# NOTE(review): the remaining arguments of this call are cut off; the loop below
# reads 'CommonPrefixes' from the response -- TODO confirm which arguments were passed.
response = s3_client.list_objects(Bucket=bucket_name,
# looping each folder that starts with facemask-detection
# NOTE(review): .get('CommonPrefixes') yields None when the key is absent, which
# would make this loop raise TypeError -- TODO confirm / guard.
for o in response.get('CommonPrefixes'):
annotation_parent_folder = o.get('Prefix')
# NOTE(review): the sub-folder string appended after the line continuation is missing.
annotation_json_folder = annotation_parent_folder + \
for json_object in bucket.objects.filter(Prefix=annotation_json_folder):
s3_json_object = s3_client.get_object(
Bucket=source_bucket_name, Key=json_object.key)
# NOTE(review): the argument to json.loads is cut off -- presumably the body of
# s3_json_object; the result is iterated as a sequence of dataset objects.
dataset_objects = json.loads(
for dataset_object in dataset_objects:
image_uri = Path(dataset_object['dataObject']['s3Uri'])
# NOTE(review): the right-hand side is missing -- presumably derived from image_uri.
image_filename =
# Keeps only the text before the FIRST dot, so a name with extra dots is shortened.
xml_filename = image_filename.split(".")[0] + ".xml"
# Assume that there is only 1 annotated data since we only have 1 worker
# The annotation content is itself a JSON string and is decoded a second time.
bndbox_string = dataset_object['annotations'][0]['annotationData']['content']
bndbox_json = json.loads(bndbox_string)
bndbox_objects = bndbox_json['annotatedResult']['boundingBoxes']
# Accumulates the XML fragments of all boxes for the current image.
inner_bndbox_xml = ""
for i in range(len(bndbox_objects)):
bndbox_object = bndbox_objects[i]
# NOTE(review): the remaining constructor arguments are cut off, and no call to
# generate_bndbox_object() is visible before .xml is read below.
bndbox_xml = BoundingBoxXml(bndbox_object['label'],
# NOTE(review): an `else:` line appears to be missing between the two appends --
# presumably the first box is appended bare and later ones with a '\n\t' prefix.
if i == 0:
inner_bndbox_xml += bndbox_xml.xml
inner_bndbox_xml += '\n\t' + bndbox_xml.xml
image_properties = bndbox_json['annotatedResult']['inputImageProperties']
xml_output = full_template.substitute(
image_filename=image_filename, image_width=image_properties['width'], image_height=image_properties['height'], bounding_box_objects=inner_bndbox_xml)
encoded_string = xml_output.encode("utf-8")
# NOTE(review): the start of this call is missing -- presumably an S3 upload of
# encoded_string under xml_filename; the target bucket is not visible.
Key=xml_filename, Body=encoded_string)
# to copy over GT annotations folder to another bucket and removing them from current bucket
for obj in bucket.objects.filter(Prefix=annotation_parent_folder):
# NOTE(review): `bucket_name` is undefined here as well (see above).
old_source = {'Bucket': bucket_name,
'Key': obj.key}
# NOTE(review): no copy call using old_source / new_obj is visible; as shown, the
# object is deleted without having been copied -- restore the copy before running.
new_obj = storage_annotations_bucket.Object(obj.key)
s3_resource.Object(bucket_name, obj.key).delete()
# NOTE(review): the closing brace of the returned dict is missing in this copy.
return {
'statusCode': 200,
'body': json.dumps('Successfully converted bounding boxes to XML format')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment