Skip to content

Instantly share code, notes, and snippets.

@rsirres
Created January 7, 2018 16:22
Show Gist options
  • Save rsirres/fc49afc4851e4b3026a0537c4706a05d to your computer and use it in GitHub Desktop.
Save rsirres/fc49afc4851e4b3026a0537c4706a05d to your computer and use it in GitHub Desktop.
Python3 XML subdocument incremental processing
{'country.rank': ['1'], 'country.year': ['2008'], 'country.gdppc': ['141100'], 'country.neighbor.@name': ['Austria', 'Switzerland'], 'country.neighbor.@direction': ['E', 'W'], 'country.@name': ['Liechtenstein']}
{'country.rank': ['4'], 'country.year': ['2011'], 'country.gdppc': ['59900'], 'country.neighbor.@name': ['Malaysia'], 'country.neighbor.@direction': ['N'], 'country.@name': ['Singapore']}
{'country.rank': ['68'], 'country.year': ['2011'], 'country.gdppc': ['13600'], 'country.neighbor.@name': ['Costa Rica', 'Colombia'], 'country.neighbor.@direction': ['W', 'E'], 'country.@name': ['Panama']}
<?xml version="1.0"?>
<data>
<country name="Liechtenstein">
<rank>1</rank>
<year>2008</year>
<gdppc>141100</gdppc>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank>4</rank>
<year>2011</year>
<gdppc>59900</gdppc>
<neighbor name="Malaysia" direction="N"/>
</country>
<country name="Panama">
<rank>68</rank>
<year>2011</year>
<gdppc>13600</gdppc>
<neighbor name="Costa Rica" direction="W"/>
<neighbor name="Colombia" direction="E"/>
</country>
</data>
import xml.etree.ElementTree as ET
from collections import defaultdict
# Path of the XML file that process_xml() streams through ET.iterparse.
file_path = "/Users/Raphael/Downloads/sample4.xml"
# Result container: process_xml() appends one mapping per subdocument here.
result = []
# XML namespace mapping, filled from "start-ns" events while parsing.
# Keys are the namespace URI wrapped in braces, values are the declared prefix:
# {http://www.w3.org/HTML/1998/html4} -> h
namespaces = {}
# Stack of tag names leading to the element currently being processed.
xml_path = []
# When True, the "{uri}" qualifier is stripped from tags before they are
# pushed onto xml_path (and therefore from the generated keys).
remove_namespaces = True
# Elements with this tag name delimit the subdocuments; the file is processed
# as a collection of such subdocuments.
subdocument_tag = "country"
### Helper
def remove_namespace(tag):
    """Return *tag* without its ``{namespace-uri}`` qualifier.

    ElementTree reports namespaced tags as ``{uri}local``.  A tag that
    contains no closing brace is returned unchanged; otherwise the piece
    that follows the first ``}`` (up to the next ``}``, if any) is returned.
    """
    if "}" not in tag:
        return tag
    pieces = tag.split("}")
    return pieces[1]
def shorten_namespace(tag):
    """Replace the ``{uri}`` qualifier of *tag* with its declared prefix.

    Looks the qualifier up in the module-level ``namespaces`` mapping, whose
    keys are ``"{uri}"`` strings and whose values are prefixes (it is filled
    by ``process_xml`` from "start-ns" events).

    Args:
        tag: Tag name as reported by ElementTree, e.g. ``"{uri}local"``.

    Returns:
        ``"prefix:local"`` when the qualifier is known, just ``"local"`` when
        the matching prefix is empty, and *tag* unchanged when no registered
        qualifier matches (including tags without any namespace).
    """
    for qualifier, prefix in namespaces.items():
        if tag.startswith(qualifier):
            local_name = tag[len(qualifier):]
            # An empty prefix is presumably the document's default namespace;
            # emitting ":local" would not be a useful name, so drop the colon.
            if prefix:
                return prefix + ":" + local_name
            return local_name
    return tag
def process_xml():
    """Stream-parse ``file_path`` and collect one flat mapping per subdocument.

    The file is read incrementally with ``ET.iterparse``.  Every element whose
    tag equals ``subdocument_tag`` (compared against both the raw tag and the
    tag with its ``{uri}`` qualifier removed) opens a subdocument.  While a
    subdocument is open, ``xml_path`` holds the tag names from the subdocument
    element down to the current element, and on each element's "end" event:

    * non-blank ``elem.text`` is appended under the dotted path key, e.g.
      ``"country.rank"``;
    * each attribute value is appended under ``"<path>.@<attribute>"``.

    When the subdocument element itself ends, its mapping (a
    ``defaultdict(list)``) is appended to the module-level ``result`` list and
    the element is cleared to release its children.

    Namespace declarations seen along the way are recorded in the module-level
    ``namespaces`` dict as ``"{uri}" -> prefix``.

    Elements outside any subdocument are ignored.  An element nested inside a
    subdocument that carries the subdocument tag name is treated as an ordinary
    child, not as a new subdocument.

    Returns:
        None.  Results are delivered through ``result`` and ``namespaces``.
    """
    # Start from a clean path in case an earlier run aborted mid-document.
    xml_path.clear()
    subresult = None

    for event, elem in ET.iterparse(file_path, events=("start-ns", "start", "end")):
        if event == "start-ns":
            # For "start-ns" the payload is a (prefix, uri) tuple, not an element.
            prefix, uri = elem
            namespaces["{" + uri + "}"] = prefix
            continue

        local_name = remove_namespace(elem.tag)
        tag = local_name if remove_namespaces else elem.tag

        if event == "start":
            if xml_path:
                # Already inside a subdocument: just descend one level.
                xml_path.append(tag)
            elif subdocument_tag in (elem.tag, local_name):
                # Exact tag match opens a new subdocument record.
                subresult = defaultdict(list)
                xml_path.append(tag)
            continue

        # event == "end"
        if not xml_path:
            # Element lies outside every subdocument; nothing to record.
            continue

        key = ".".join(xml_path)
        if elem.text and elem.text.strip():
            subresult[key].append(elem.text)
        for attribute, value in elem.attrib.items():
            subresult[key + ".@" + attribute].append(value)

        xml_path.pop()
        if not xml_path:
            # The subdocument element itself just closed: emit the record and
            # free the subtree so memory use stays bounded by one subdocument.
            result.append(subresult)
            subresult = None
            elem.clear()
# Run the extraction; this fills `namespaces` and `result`.
process_xml()
# Dump every recorded namespace as "{uri}" followed by its prefix.
for qualifier, prefix in namespaces.items():
    print(qualifier, " ", prefix)
# Dump each subdocument as a plain dict rather than a defaultdict repr.
for subdocument in result:
    print(dict(subdocument))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment