-
-
Save anthony-sarkis/fd20eac85d52f7d66e26289ac619afd2 to your computer and use it in GitHub Desktop.
cloud convert gcp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import requests | |
import time | |
from PIL import Image, ExifTags | |
import matplotlib.pyplot as plt | |
import matplotlib.patches as patches | |
from random import randrange | |
GCP_BUCKET_NAME = 'cloud-ai-platform-something-something/images' | |
def convert(diffgram_all_data_path):
    """Convert a Diffgram export JSON file to Google Vertex AI
    image-object-detection JSONL format.

    Reads the export at ``diffgram_all_data_path``, normalizes every bounding
    box to [0, 1] relative coordinates (rotating boxes 90 degrees CCW first
    for images whose EXIF orientation tag is 6), and writes one JSON object
    per image to ``google_format_<timestamp>.jsonl``.

    Expects the image files to already exist locally at
    ``images/<file_id>.jpg`` (see ``download_to_folder``).
    """
    with open(diffgram_all_data_path, 'r') as readfile:
        all_data = json.load(readfile)
    label_map = all_data.pop('label_map')
    all_data.pop('export_info')
    all_data.pop('attribute_groups_reference')
    export_data = []
    annotation = 0   # total instances processed
    over = 0         # boxes with any normalized coordinate > 1.0
    rotated = 0      # boxes rotated due to EXIF orientation == 6
    yissue = 0       # boxes where yMax < yMin after conversion
    xissue = 0       # boxes where xMax < xMin after conversion
    for file_id, item in all_data.items():
        file = item.get('file')
        if not file:
            continue
        single_file = {
            "imageGcsUri": F"gs://{GCP_BUCKET_NAME}/{file_id}.jpg",
            "boundingBoxAnnotations": []
        }
        width = item.get('image').get('width')
        height = item.get('image').get('height')
        # (Could also do streaming here to get this data)
        # https://diffgram.readme.io/docs/sdk-dataset-list_files
        instance_list = item.get('instance_list')
        if not instance_list:
            continue
        # Reset per file. Previously a failed EXIF read left `exif` either
        # unbound (NameError on the very first file) or carrying the
        # PREVIOUS image's orientation, silently rotating the wrong boxes.
        exif = {}
        # Close the image handle promptly — one leaked fd per image otherwise.
        with Image.open(f'images/{file_id}.jpg') as img:
            try:
                raw_exif = img._getexif()  # returns None when no EXIF block
                if raw_exif:
                    exif = {
                        ExifTags.TAGS[k]: v
                        for k, v in raw_exif.items()
                        if k in ExifTags.TAGS
                    }
            except Exception:
                # Best-effort: an unreadable EXIF block means "no rotation".
                pass
        for instance in instance_list:
            annotation += 1
            if exif.get('Orientation') == 6:
                # Pixel data is stored rotated 90 degrees CW relative to the
                # annotations; rotate the box to match, then normalize
                # against the swapped dimensions.
                instance = rotate_box(instance, width)
                rotated += 1
                xMin = instance.get('x_min') / height
                yMin = instance.get('y_min') / width  # swap
                xMax = instance.get('x_max') / height
                yMax = instance.get('y_max') / width
            else:
                xMin = instance.get('x_min') / width
                yMin = instance.get('y_min') / height
                xMax = instance.get('x_max') / width
                yMax = instance.get('y_max') / height
            new_item = {
                "displayName": label_map[str(instance.get('label_file_id'))],
                "xMin": xMin,
                "yMin": yMin,
                "xMax": xMax,
                "yMax": yMax
            }
            single_file['boundingBoxAnnotations'].append(new_item)
            # Sanity counters for spotting bad annotations in the export.
            if xMin > 1 or yMin > 1 or xMax > 1 or yMax > 1:
                over += 1
            if yMax < yMin:
                yissue += 1
            if xMax < xMin:
                xissue += 1
        export_data.append(single_file)
    # Guard the ratio: an export with zero instances must not crash here.
    percent = over / annotation if annotation else 0
    print(F"Annotations {annotation} Over 1.0 {over} Percent {percent} ")
    print(F"rotated {rotated} ")
    print(F"yissue {yissue} xissue {xissue} ")
    print(len(export_data))
    with open(F'google_format_{time.time()}.jsonl', 'w') as outfile:
        for entry in export_data:
            json.dump(entry, outfile)
            outfile.write('\n')
def rotate_box(instance: dict, width):
    """Rotate a bounding box 90 degrees counter-clockwise, in place.

    The new x range takes the old y values, and the new y range mirrors the
    old x values across ``width`` (the far x edge becomes the near y edge).
    The dict is mutated and the same object is returned.
    """
    x_lo, x_hi = instance.get('x_min'), instance.get('x_max')
    instance['x_min'] = instance.get('y_min')
    instance['x_max'] = instance.get('y_max')
    instance['y_min'] = width - x_hi  # mirrored: far edge becomes near edge
    instance['y_max'] = width - x_lo
    return instance
from threading import Thread | |
from queue import Queue | |
class EventsWorker(Thread):
    """Worker thread that drains (url, file_id) pairs from a queue and
    downloads each one via ``download_one``.

    Runs forever; callers mark it as a daemon and use ``queue.join()`` to
    wait for all queued work to finish.
    """

    def __init__(self, queue):
        super().__init__()
        self.queue = queue  # shared work queue of (blob_url, file_id) pairs

    def run(self):
        while True:
            url, file_id = self.queue.get()
            try:
                download_one(url, file_id)
            finally:
                # Always signal completion, even on a failed download, so
                # queue.join() in the producer cannot hang.
                self.queue.task_done()
def download_one(path, file_id, timeout=60):
    """Download one image and save it as ``images/<file_id>.jpg``.

    Parameters:
        path: URL of the image blob to fetch.
        file_id: Diffgram file id, used to name the local file.
        timeout: seconds to wait for the HTTP response. The original call
            had no timeout, so a stalled server could hang a worker thread
            forever; the default keeps existing call sites working.
    """
    r = requests.get(path, timeout=timeout)
    with open(F'images/{file_id}.jpg', 'wb') as f:
        f.write(r.content)
def download_to_folder(diffgram_all_data_path):
    """Download every annotated image from a Diffgram export into ``images/``.

    Starts 25 daemon worker threads, enqueues a (blob_url, file_id) pair for
    each file that has a 'file' entry and at least one instance, then blocks
    until every queued download has completed.
    """
    # Use a context manager: the original open() leaked the file handle.
    with open(diffgram_all_data_path) as export_file:
        all_data = json.load(export_file)
    all_data.pop('export_info')
    all_data.pop('attribute_groups_reference')
    queue = Queue()
    for _ in range(25):
        worker = EventsWorker(queue)
        # Daemon threads: the infinite run() loops must not block exit.
        worker.daemon = True
        worker.start()
    for file_id, item in all_data.items():
        file = item.get('file')
        if not file:
            continue
        # Skip files with no annotations — convert() ignores them anyway.
        if not item.get('instance_list'):
            continue
        queue.put((file.get('blob_url'), file_id))
    queue.join()  # wait for every task_done() from the workers
# Entry point: run download_to_folder once to fetch the images locally,
# then convert to produce the Vertex AI JSONL file.
#download_to_folder("export.json")
convert("export.json")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment