Skip to content

Instantly share code, notes, and snippets.

@srishilesh
Created January 23, 2022 17:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save srishilesh/a2c781a7c6376861844fb7234c5aea11 to your computer and use it in GitHub Desktop.
Save srishilesh/a2c781a7c6376861844fb7234c5aea11 to your computer and use it in GitHub Desktop.
Validate dataset - PASCAL VOC
import xmltodict
import xml.etree.ElementTree as ET
# Child elements every 'object' entry is checked for, in the order:
# name, pose, truncated, difficult, bndbox.
required_objects = ['name', 'pose', 'truncated', 'difficult', 'bndbox']

# Coordinates checked inside every 'bndbox' element.
_BNDBOX_COORDINATES = ('xmin', 'ymin', 'xmax', 'ymax')


def _require(condition, message):
    """Raise ``AssertionError(message)`` unless *condition* is truthy.

    An explicit raise is used instead of an ``assert`` statement so that the
    validation is not stripped when Python runs with ``-O``.
    """
    if not condition:
        raise AssertionError(message)


def _as_int(text):
    """Return *text* parsed as an int, or None if it is missing or not an integer."""
    try:
        return int(text)
    except (TypeError, ValueError):
        # TypeError: the element was absent (findtext returned None).
        # ValueError: the element text is not an integer literal.
        return None


def _validate_object(obj):
    """Check one 'object' element for its required children and bounding boxes.

    Raises:
        AssertionError: if a required child is missing, empty, or out of range.
    """
    name_tag, pose_tag, truncated_tag, difficult_tag, bndbox_tag = required_objects

    # findtext() yields None for a missing child and '' for an empty one;
    # both are falsy, so a plain truthiness test covers the two cases.
    _require(obj.findtext(name_tag), "Object does not contain a parameter 'name'")
    _require(obj.findtext(pose_tag), "Object does not contain a parameter 'pose'")
    _require(_as_int(obj.findtext(truncated_tag)) in (0, 1),
             "Object does not contain a parameter 'truncated'")
    _require(_as_int(obj.findtext(difficult_tag)) in (0, 1),
             "Object does not contain a parameter 'difficult'")

    boxes = obj.findall(bndbox_tag)
    _require(boxes, "Object does not contain a parameter 'bndbox'")
    for bbox in boxes:
        for coordinate in _BNDBOX_COORDINATES:
            value = _as_int(bbox.findtext(coordinate))
            # Each coordinate must be present and a strictly positive integer.
            _require(value is not None and value > 0,
                     f"'{coordinate}' value for the bounding box is missing ")


def validate_pascal_voc(root):
    """Validate that *root* looks like a PASCAL VOC annotation document.

    Args:
        root: the root ``xml.etree.ElementTree.Element`` of the parsed file.

    Returns:
        None. Returning normally means every check passed.

    Raises:
        AssertionError: with a descriptive message on the first failed check.
            Missing elements and non-numeric values are reported this way too,
            rather than surfacing as TypeError/AttributeError/KeyError/ValueError.
    """
    # The root is accepted when it is <annotation> or carries verified="yes".
    # .get() avoids a KeyError when the attribute is absent.
    _require(root.tag == 'annotation' or root.attrib.get('verified') == 'yes',
             "PASCAL VOC does not contain a root element")

    _require(root.findtext('folder'), "XML file does not contain a 'folder' element")
    _require(root.findtext('filename'), "XML file does not contain a 'filename'")
    _require(root.findtext('path'), "XML file does not contain 'path' element")

    # Elements are compared against None explicitly: an Element's truth value
    # depends on its child count, not on whether it was found.
    source = root.find('source')
    _require(source is not None and len(source) == 1 and source.findtext('database'),
             "XML file does not contain 'source' element with a 'database'")

    size = root.find('size')
    _require(size is not None and len(size) == 3,
             "XML file doesn not contain 'size' element")
    _require(all(size.findtext(dimension) for dimension in ('width', 'height', 'depth')),
             "XML file does not contain either 'width', 'height', or 'depth' element")

    # 'segmented' must either hold the text '0' or have child elements.
    segmented = root.find('segmented')
    _require(segmented is not None and (segmented.text == '0' or len(segmented) > 0),
             "'segmented' element is neither 0 or a list")

    # At least one 'object' entry is required.
    objects = root.findall('object')
    _require(objects, "XML file contains no 'object' element")
    for obj in objects:
        _validate_object(obj)


# NOTE: the script entry is guarded so the validators above can be imported
# without touching the filesystem; running the file directly (or in a notebook
# cell) parses and validates the file exactly as before.
if __name__ == "__main__":
    dataset_file = r'/content/sample.xml'  # The path to the XML file
    xml_tree = ET.parse(dataset_file)  # Parse the XML file
    root = xml_tree.getroot()  # Find the root element
    validate_pascal_voc(root)
    print('The dataset format is PASCAL VOC!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment