Skip to content

Instantly share code, notes, and snippets.

@shubh-agrawal
Created September 26, 2016 18:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shubh-agrawal/2ec69019b056941882d62c3c102f243b to your computer and use it in GitHub Desktop.
Save shubh-agrawal/2ec69019b056941882d62c3c102f243b to your computer and use it in GitHub Desktop.
# Two arguments are required.
# Prerequisites: 1. Format the dataset as explained by deboc
# 2. Create the Imageset text file using 'ls Annotations/ -m | sed s/\\s/\\n/g | sed s/.xml//g | sed s/,//g > ImageSets/train.txt'
# 1st argument = Imageset file name. Example: train.txt
# 2nd argument = path to the custom dataset. Example: ~/py-faster-rcnn/data/VOC_devkit/
# Usage instructions
# $ python discard_shuffle_data.py [arg1] [arg2]
import numpy as np
import random, sys, os
import cv2
import xml.etree.ElementTree as ET
# Image file extensions tried, in this order, when resolving an image index
# to a file on disk.
ext = [
    '.png',
    '.jpg',
    '.jpeg',
]

# Image indexes rejected during bbox validation; the script appends to this
# list and uses it to filter the rewritten imageset file.
discard_list = []
def get_image_path_from_index(index):
    """
    Build the path of the image file that belongs to an image index.

    Candidates have the form <data_path>/data/Images/<index><suffix>, where
    ``data_path`` is the module-level dataset root and <suffix> runs over the
    module-level ``ext`` list in order. The first candidate that exists on
    disk is returned.

    index: image file name without its extension.
    Returns: path of the first existing candidate.
    Raises: AssertionError if none of the candidates exists; the message
    names the last candidate that was tried.
    """
    image_dir = os.path.join(data_path, 'data', 'Images')
    # Start from the extension-less path so the error message below is always
    # defined, even if ``ext`` were empty.
    image_path = os.path.join(image_dir, index)
    for suffix in ext:
        image_path = os.path.join(image_dir, index + suffix)
        if os.path.exists(image_path):
            return image_path
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    raise AssertionError('Path does not exist: {}'.format(image_path))
def get_annotation_path_from_index(index):
    """
    Return the path of the annotation XML file for an image index.

    The path has the form <data_path>/data/Annotations/<index>.xml, where
    ``data_path`` is the module-level dataset root.

    index: annotation file name without the '.xml' extension.
    Returns: the annotation file path.
    Raises: AssertionError if the file does not exist.
    """
    annotation_path = os.path.join(
        data_path, 'data', 'Annotations', index + '.xml')
    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not os.path.exists(annotation_path):
        raise AssertionError(
            'Path does not exist: {}'.format(annotation_path))
    return annotation_path
def get_image_size(image_path):
    """
    Return the height and width of the image stored at image_path.

    image_path: path of the image file to inspect.
    Returns: a (height, width) tuple taken from the first two entries of the
    loaded array's shape.
    Raises: IOError if the image cannot be read.
    """
    # Flag 0 asks OpenCV for a grayscale load; only the shape is used here.
    img = cv2.imread(image_path, 0)
    if img is None:
        # cv2.imread returns None instead of raising when the file is
        # missing or unreadable; fail with a clear message rather than an
        # AttributeError on ``None.shape``.
        raise IOError('Could not read image: {}'.format(image_path))
    height, width = img.shape[:2]
    # The original computed these values but never returned them.
    return height, width
def get_bbox_size():
    """
    Placeholder for returning the bbox size that is to be compared with the
    image height and width; the XML parsing was meant to live here.

    The function has no implementation yet and always returns None.
    """
    # Both description strings of the original are merged into the docstring
    # above; the second one used to be a no-op expression statement.
    return None
def _has_invalid_bbox(annotation_path):
    """Return True if any object box in the annotation is inverted or falls outside the image."""
    tree = ET.parse(annotation_path)
    img_width = int(tree.find('.//width').text)
    img_height = int(tree.find('.//height').text)
    for obj in tree.findall('object'):
        bbox = obj.find('bndbox')
        # Same -1 shift as the original script (presumably converting
        # 1-based annotation coordinates to 0-based; confirm against the
        # dataset).
        x1 = int(bbox.find('xmin').text) - 1
        y1 = int(bbox.find('ymin').text) - 1
        x2 = int(bbox.find('xmax').text) - 1
        y2 = int(bbox.find('ymax').text) - 1
        # The lower bound of -1 is kept from the original (it tolerates
        # xmin/ymin == 0). The original only compared the box width/height
        # with the image width/height, so a box lying entirely outside the
        # image or an inverted box was accepted; compare the corners
        # against the image bounds instead. Every box rejected by the old
        # test is still rejected by this one.
        if (x1 < -1 or y1 < -1
                or x2 < x1 or y2 < y1
                or x2 >= img_width or y2 >= img_height):
            return True
    return False


# Fail with a usage hint instead of a bare IndexError when arguments are
# missing.
if len(sys.argv) < 3:
    sys.exit('Usage: python {} [imageset file name] [dataset path]'
             .format(sys.argv[0]))

imageset_filename = sys.argv[1]
# Dataset root; read as a global by the path helper functions.
data_path = sys.argv[2]
imageset_file_path = os.path.join(
    data_path, 'data', 'ImageSets', imageset_filename)
if not os.path.exists(imageset_file_path):
    raise AssertionError(
        'Path does not exist: {}'.format(imageset_file_path))

# Read the indexes once, stripped of whitespace. Blank lines are dropped so
# they are not treated as empty image indexes.
with open(imageset_file_path) as imageset_file:
    image_indexes = [line.strip() for line in imageset_file if line.strip()]

for image_index in image_indexes:
    # Called only for its existence check: raises if the image is missing.
    get_image_path_from_index(image_index)
    annotation_path = get_annotation_path_from_index(image_index)
    if _has_invalid_bbox(annotation_path):
        discard_list.append(image_index)
        # Single-argument form prints the same text under Python 2 and 3.
        print("Discarded image index :  {}".format(image_index))

print("No. of discarded indexes :  {}".format(len(discard_list)))

random.shuffle(image_indexes)  # shuffle the order of the remaining indexes
discarded = set(discard_list)  # O(1) membership tests while filtering

# Rewrite the imageset file in place without the discarded indexes. Each
# index gets its own newline so that a final line lacking one in the input
# cannot be glued to another index after shuffling.
with open(imageset_file_path, "w") as text_file:
    for image_index in image_indexes:
        if image_index not in discarded:
            text_file.write(image_index + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment