Created
January 23, 2022 17:09
-
-
Save srishilesh/a2c781a7c6376861844fb7234c5aea11 to your computer and use it in GitHub Desktop.
Validate dataset - PASCAL VOC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import xml.etree.ElementTree as ET

import xmltodict
dataset_file = r'/content/sample.xml'  # The path to the XML file

# Child tags checked on every 'object' element, in the order they are checked.
required_objects = ['name', 'pose', 'truncated', 'difficult', 'bndbox']


def _require(condition, message):
    """Raise ``AssertionError(message)`` unless ``condition`` is truthy.

    An explicit raise is used instead of the ``assert`` statement so the
    checks are still performed when Python runs with ``-O``.
    """
    if not condition:
        raise AssertionError(message)


def _has_text(parent, tag):
    """Return True when ``parent`` has a child ``tag`` with non-empty text."""
    # findtext() returns None for a missing child and '' for an empty one;
    # both are falsy, so neither can crash a len() call any more.
    return parent is not None and bool(parent.findtext(tag))


def _is_flag(text):
    """Return True when ``text`` parses as the integer 0 or 1."""
    try:
        return int(text) in (0, 1)
    except (TypeError, ValueError):  # missing element or non-numeric text
        return False


def _is_positive_int(text):
    """Return True when ``text`` parses as an integer greater than zero."""
    try:
        return int(text) > 0
    except (TypeError, ValueError):  # missing element or non-numeric text
        return False


def validate_pascal_voc(root):
    """Check that ``root`` looks like a PASCAL VOC annotation element.

    The checks are, in order: the root tag is 'annotation'; 'folder',
    'filename' and 'path' have text; 'source' has exactly one child and a
    non-empty 'database'; 'size' has exactly three children and non-empty
    'width', 'height' and 'depth'; 'segmented' is '0' or has children; at
    least one 'object' exists, and each object has 'name', 'pose', 0/1
    'truncated' and 'difficult' flags, and at least one 'bndbox' whose
    'xmin', 'ymin', 'xmax' and 'ymax' are integers greater than zero.

    Args:
        root: The root ``xml.etree.ElementTree.Element`` of the parsed file.

    Returns:
        None. Returning normally means every check passed.

    Raises:
        AssertionError: On the first failed check, carrying its message.
            Missing or non-numeric elements are reported this way too,
            rather than surfacing as TypeError/AttributeError/ValueError.
    """
    # Only the tag decides whether this is an annotation root. The 'verified'
    # attribute is not consulted: it must not let a different root tag pass,
    # and its absence must not raise KeyError.
    _require(root.tag == 'annotation', "PASCAL VOC does not contain a root element")

    _require(_has_text(root, 'folder'), "XML file does not contain a 'folder' element")
    _require(_has_text(root, 'filename'), "XML file does not contain a 'filename'")
    _require(_has_text(root, 'path'), "XML file does not contain 'path' element")

    source = root.find('source')
    _require(
        source is not None and len(source) == 1 and _has_text(source, 'database'),
        "XML file does not contain 'source' element with a 'database'",
    )

    size = root.find('size')
    _require(size is not None and len(size) == 3, "XML file doesn not contain 'size' element")
    _require(
        all(size.findtext(dimension) for dimension in ('width', 'height', 'depth')),
        "XML file does not contain either 'width', 'height', or 'depth' element",
    )

    segmented = root.find('segmented')
    _require(
        segmented is not None and (segmented.text == '0' or len(segmented) > 0),
        "'segmented' element is neither 0 or a list",
    )

    objects = root.findall('object')
    _require(len(objects) > 0, "XML file contains no 'object' element")

    name_tag, pose_tag, truncated_tag, difficult_tag, bndbox_tag = required_objects
    for obj in objects:
        _require(_has_text(obj, name_tag), "Object does not contain a parameter 'name'")
        _require(_has_text(obj, pose_tag), "Object does not contain a parameter 'pose'")
        _require(_is_flag(obj.findtext(truncated_tag)), "Object does not contain a parameter 'truncated'")
        _require(_is_flag(obj.findtext(difficult_tag)), "Object does not contain a parameter 'difficult'")

        boxes = obj.findall(bndbox_tag)
        _require(len(boxes) > 0, "Object does not contain a parameter 'bndbox'")
        for bbox in boxes:
            for coordinate in ('xmin', 'ymin', 'xmax', 'ymax'):
                _require(
                    _is_positive_int(bbox.findtext(coordinate)),
                    f"'{coordinate}' value for the bounding box is missing ",
                )


if __name__ == "__main__":
    xml_tree = ET.parse(dataset_file)  # Parse the XML file
    root = xml_tree.getroot()  # Find the root element
    validate_pascal_voc(root)
    print('The dataset format is PASCAL VOC!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment