-
-
Save anthony-sarkis/fd20eac85d52f7d66e26289ac619afd2 to your computer and use it in GitHub Desktop.
cloud convert gcp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import requests | |
import time | |
from PIL import Image, ExifTags | |
import matplotlib.pyplot as plt | |
import matplotlib.patches as patches | |
from random import randrange | |
GCP_BUCKET_NAME = 'cloud-ai-platform-something-something/images' | |
def convert(diffgram_all_data_path):
    """Convert a Diffgram export JSON file to Google Vertex AI
    image-object-detection JSONL format.

    Reads the export at ``diffgram_all_data_path``, normalizes every bounding
    box to [0, 1] relative coordinates (rotating boxes 90 degrees CCW first
    for images whose EXIF orientation tag is 6), and writes one JSON object
    per image to ``google_format_<timestamp>.jsonl``.

    Expects the image files to already exist locally at
    ``images/<file_id>.jpg`` (see ``download_to_folder``).
    """
    with open(diffgram_all_data_path, 'r') as readfile:
        all_data = json.load(readfile)
    label_map = all_data.pop('label_map')
    all_data.pop('export_info')
    all_data.pop('attribute_groups_reference')
    export_data = []
    annotation = 0   # total instances processed
    over = 0         # boxes with any normalized coordinate > 1.0
    rotated = 0      # boxes rotated due to EXIF orientation == 6
    yissue = 0       # boxes where yMax < yMin after conversion
    xissue = 0       # boxes where xMax < xMin after conversion
    for file_id, item in all_data.items():
        file = item.get('file')
        if not file:
            continue
        single_file = {
            "imageGcsUri": F"gs://{GCP_BUCKET_NAME}/{file_id}.jpg",
            "boundingBoxAnnotations": []
        }
        width = item.get('image').get('width')
        height = item.get('image').get('height')
        # (Could also do streaming here to get this data)
        # https://diffgram.readme.io/docs/sdk-dataset-list_files
        instance_list = item.get('instance_list')
        if not instance_list:
            continue
        # Reset per file. Previously a failed EXIF read left `exif` either
        # unbound (NameError on the very first file) or carrying the
        # PREVIOUS image's orientation, silently rotating the wrong boxes.
        exif = {}
        # Close the image handle promptly — one leaked fd per image otherwise.
        with Image.open(f'images/{file_id}.jpg') as img:
            try:
                raw_exif = img._getexif()  # returns None when no EXIF block
                if raw_exif:
                    exif = {
                        ExifTags.TAGS[k]: v
                        for k, v in raw_exif.items()
                        if k in ExifTags.TAGS
                    }
            except Exception:
                # Best-effort: an unreadable EXIF block means "no rotation".
                pass
        for instance in instance_list:
            annotation += 1
            if exif.get('Orientation') == 6:
                # Pixel data is stored rotated 90 degrees CW relative to the
                # annotations; rotate the box to match, then normalize
                # against the swapped dimensions.
                instance = rotate_box(instance, width)
                rotated += 1
                xMin = instance.get('x_min') / height
                yMin = instance.get('y_min') / width  # swap
                xMax = instance.get('x_max') / height
                yMax = instance.get('y_max') / width
            else:
                xMin = instance.get('x_min') / width
                yMin = instance.get('y_min') / height
                xMax = instance.get('x_max') / width
                yMax = instance.get('y_max') / height
            new_item = {
                "displayName": label_map[str(instance.get('label_file_id'))],
                "xMin": xMin,
                "yMin": yMin,
                "xMax": xMax,
                "yMax": yMax
            }
            single_file['boundingBoxAnnotations'].append(new_item)
            # Sanity counters for spotting bad annotations in the export.
            if xMin > 1 or yMin > 1 or xMax > 1 or yMax > 1:
                over += 1
            if yMax < yMin:
                yissue += 1
            if xMax < xMin:
                xissue += 1
        export_data.append(single_file)
    # Guard the ratio: an export with zero instances must not crash here.
    percent = over / annotation if annotation else 0
    print(F"Annotations {annotation} Over 1.0 {over} Percent {percent} ")
    print(F"rotated {rotated} ")
    print(F"yissue {yissue} xissue {xissue} ")
    print(len(export_data))
    with open(F'google_format_{time.time()}.jsonl', 'w') as outfile:
        for entry in export_data:
            json.dump(entry, outfile)
            outfile.write('\n')
def rotate_box(instance: dict, width):
    """Rotate a bounding box 90 degrees counter-clockwise, in place.

    The new x range takes the old y values, and the new y range mirrors the
    old x values across ``width`` (the far x edge becomes the near y edge).
    The dict is mutated and the same object is returned.
    """
    x_lo, x_hi = instance.get('x_min'), instance.get('x_max')
    instance['x_min'] = instance.get('y_min')
    instance['x_max'] = instance.get('y_max')
    instance['y_min'] = width - x_hi  # mirrored: far edge becomes near edge
    instance['y_max'] = width - x_lo
    return instance
from threading import Thread | |
from queue import Queue | |
class EventsWorker(Thread):
    """Worker thread that drains (url, file_id) pairs from a queue and
    downloads each one via ``download_one``.

    Runs forever; callers mark it as a daemon and use ``queue.join()`` to
    wait for all queued work to finish.
    """

    def __init__(self, queue):
        super().__init__()
        self.queue = queue  # shared work queue of (blob_url, file_id) pairs

    def run(self):
        while True:
            url, file_id = self.queue.get()
            try:
                download_one(url, file_id)
            finally:
                # Always signal completion, even on a failed download, so
                # queue.join() in the producer cannot hang.
                self.queue.task_done()
def download_one(path, file_id, timeout=60):
    """Download one image and save it as ``images/<file_id>.jpg``.

    Parameters:
        path: URL of the image blob to fetch.
        file_id: Diffgram file id, used to name the local file.
        timeout: seconds to wait for the HTTP response. The original call
            had no timeout, so a stalled server could hang a worker thread
            forever; the default keeps existing call sites working.
    """
    r = requests.get(path, timeout=timeout)
    with open(F'images/{file_id}.jpg', 'wb') as f:
        f.write(r.content)
def download_to_folder(diffgram_all_data_path):
    """Download every annotated image from a Diffgram export into ``images/``.

    Starts 25 daemon worker threads, enqueues a (blob_url, file_id) pair for
    each file that has a 'file' entry and at least one instance, then blocks
    until every queued download has completed.
    """
    # Use a context manager: the original open() leaked the file handle.
    with open(diffgram_all_data_path) as export_file:
        all_data = json.load(export_file)
    all_data.pop('export_info')
    all_data.pop('attribute_groups_reference')
    queue = Queue()
    for _ in range(25):
        worker = EventsWorker(queue)
        # Daemon threads: the infinite run() loops must not block exit.
        worker.daemon = True
        worker.start()
    for file_id, item in all_data.items():
        file = item.get('file')
        if not file:
            continue
        # Skip files with no annotations — convert() ignores them anyway.
        if not item.get('instance_list'):
            continue
        queue.put((file.get('blob_url'), file_id))
    queue.join()  # wait for every task_done() from the workers
# Entry point: run download_to_folder once to fetch the images locally,
# then convert to produce the Vertex AI JSONL file.
#download_to_folder("export.json")
convert("export.json")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment