rsirres/output.txt

## output.txt
{'country.rank': ['1'], 'country.year': ['2008'], 'country.gdppc': ['141100'], 'country.neighbor.@name': ['Austria', 'Switzerland'], 'country.neighbor.@direction': ['E', 'W'], 'country.@name': ['Liechtenstein']}
{'country.rank': ['4'], 'country.year': ['2011'], 'country.gdppc': ['59900'], 'country.neighbor.@name': ['Malaysia'], 'country.neighbor.@direction': ['N'], 'country.@name': ['Singapore']}
{'country.rank': ['68'], 'country.year': ['2011'], 'country.gdppc': ['13600'], 'country.neighbor.@name': ['Costa Rica', 'Colombia'], 'country.neighbor.@direction': ['W', 'E'], 'country.@name': ['Panama']}

## sample4.xml
<?xml version="1.0"?>
<data>
    <country name="Liechtenstein">
        <rank>1</rank>
        <year>2008</year>
        <gdppc>141100</gdppc>
        <neighbor name="Austria" direction="E"/>
        <neighbor name="Switzerland" direction="W"/>
    </country>
    <country name="Singapore">
        <rank>4</rank>
        <year>2011</year>
        <gdppc>59900</gdppc>
        <neighbor name="Malaysia" direction="N"/>
    </country>
    <country name="Panama">
        <rank>68</rank>
        <year>2011</year>
        <gdppc>13600</gdppc>
        <neighbor name="Costa Rica" direction="W"/>
        <neighbor name="Colombia" direction="E"/>
    </country>
</data>

## xml_subdocument_processing.py
import xml.etree.ElementTree as ET
from collections import defaultdict

# Location of the XML document to parse
file_path = "/Users/Raphael/Downloads/sample4.xml"

# Tag name delimiting one subdocument: the XML file is processed as a
# collection of such subdocuments
subdocument_tag = "country"

# When true, the "{uri}" namespace part is stripped from tag names
remove_namespaces = True

# Collected subdocument records
result = list()

# XML namespaces: maps the fully qualified namespace to its prefix,
# e.g. http://www.w3.org/HTML/1998/html4 -> h
namespaces = dict()

# Path (stack of tag names) to the XML element currently being processed
xml_path = list()


### Helper

def remove_namespace(tag):
    """Return *tag* without its leading ``{namespace}`` qualifier.

    Tags that contain no ``}`` are returned unchanged.
    """
    if "}" not in tag:
        return tag

    # "{uri}local" splits into ["{uri", "local"]; the second piece is the name.
    pieces = tag.split("}")
    return pieces[1]


def shorten_namespace(tag):
    """Placeholder that walks the ``namespaces`` mapping without acting on it.

    The body is a stub: *tag* is never read and the function implicitly
    returns ``None``.
    """
    # TODO: not implemented yet; every entry is visited and ignored.
    for _entry in namespaces.items():
        continue

def process_xml():
    """Stream-parse ``file_path`` and collect one record per subdocument.

    An element whose namespace-free tag name equals ``subdocument_tag``
    opens a new record.  While inside such an element, non-blank element
    text is stored under the dotted path of tag names (``country.rank``)
    and attribute values under ``<path>.@<attribute>``
    (``country.neighbor.@name``).  Values are appended to lists, so
    repeated elements accumulate in document order.

    Elements outside a subdocument are ignored.

    Side effects on module-level state:
        result     -- receives one ``defaultdict(list)`` per subdocument.
        namespaces -- maps ``"{uri}"`` to the prefix declared in the file.
        xml_path   -- stack of currently open tags; emptied on entry and
                      empty again after a complete parse.
    """
    subresult = None

    # A non-empty stack means "inside a subdocument", so start from a clean
    # one in case an earlier run was aborted half-way through a document.
    del xml_path[:]

    for event, elem in ET.iterparse(file_path, events=("start-ns", "start", "end")):
        if event == "start-ns":
            # For this event ``elem`` is a (prefix, uri) tuple.
            namespaces["{" + elem[1] + "}"] = elem[0]

        elif event == "start":
            local_name = remove_namespace(elem.tag)

            # Compare the local name exactly: a suffix test would also match
            # unrelated tags such as "subcountry".
            opens_subdocument = not xml_path and local_name == subdocument_tag
            if opens_subdocument:
                subresult = defaultdict(list)

            # Track the path from the subdocument tag downwards.
            if opens_subdocument or xml_path:
                xml_path.append(local_name if remove_namespaces else elem.tag)

        elif event == "end":
            # Nothing is open, so this element lies outside any subdocument
            # and there is no record to write to.
            if not xml_path:
                continue

            key = ".".join(xml_path)

            if elem.text and elem.text.strip():
                subresult[key].append(elem.text)

            for attribute, value in elem.attrib.items():
                subresult[key + ".@" + attribute].append(value)

            # Pop the element that just closed; the decision relies on the
            # stack itself rather than on a tag remembered from a start event.
            xml_path.pop()

            if not xml_path:
                # The subdocument is complete: publish it and release the
                # parsed subtree to keep memory usage flat.
                result.append(subresult)
                subresult = None
                elem.clear()


# Run the parser; it fills the module-level ``namespaces`` and ``result``
process_xml()

# Print Namespaces (qualified "{uri}" key, three spaces, prefix)
for uri, prefix in namespaces.items():
    print(uri, prefix, sep="   ")

# Print Result, one plain dict per subdocument
for record in result:
    print(dict(record))
	{'country.rank': ['1'], 'country.year': ['2008'], 'country.gdppc': ['141100'], 'country.neighbor.@name': ['Austria', 'Switzerland'], 'country.neighbor.@direction': ['E', 'W'], 'country.@name': ['Liechtenstein']}
	{'country.rank': ['4'], 'country.year': ['2011'], 'country.gdppc': ['59900'], 'country.neighbor.@name': ['Malaysia'], 'country.neighbor.@direction': ['N'], 'country.@name': ['Singapore']}
	{'country.rank': ['68'], 'country.year': ['2011'], 'country.gdppc': ['13600'], 'country.neighbor.@name': ['Costa Rica', 'Colombia'], 'country.neighbor.@direction': ['W', 'E'], 'country.@name': ['Panama']}
	<?xml version="1.0"?>
	<data>
	<country name="Liechtenstein">
	<rank>1</rank>
	<year>2008</year>
	<gdppc>141100</gdppc>
	<neighbor name="Austria" direction="E"/>
	<neighbor name="Switzerland" direction="W"/>
	</country>
	<country name="Singapore">
	<rank>4</rank>
	<year>2011</year>
	<gdppc>59900</gdppc>
	<neighbor name="Malaysia" direction="N"/>
	</country>
	<country name="Panama">
	<rank>68</rank>
	<year>2011</year>
	<gdppc>13600</gdppc>
	<neighbor name="Costa Rica" direction="W"/>
	<neighbor name="Colombia" direction="E"/>
	</country>
	</data>
	import xml.etree.ElementTree as ET
	from collections import defaultdict

	# Location of the XML document to parse
	file_path = "/Users/Raphael/Downloads/sample4.xml"

	# Tag name delimiting one subdocument: the XML file is processed as a
	# collection of such subdocuments
	subdocument_tag = "country"

	# When true, the "{uri}" namespace part is stripped from tag names
	remove_namespaces = True

	# Collected subdocument records
	result = list()

	# XML namespaces: maps the fully qualified namespace to its prefix,
	# e.g. http://www.w3.org/HTML/1998/html4 -> h
	namespaces = dict()

	# Path (stack of tag names) to the XML element currently being processed
	xml_path = list()




	### Helper

	def remove_namespace(tag):
	    """Return *tag* without its leading ``{namespace}`` qualifier.

	    Tags that contain no ``}`` are returned unchanged.
	    """
	    if "}" in tag:
	        # "{uri}local" splits into ["{uri", "local"].
	        tag = tag.split("}")[1]

	    return tag


	def shorten_namespace(tag):
	    """Placeholder that walks the ``namespaces`` mapping without acting on it.

	    The body is a stub: *tag* is never read and the function implicitly
	    returns ``None``.
	    """
	    # TODO: not implemented yet; every entry is visited and ignored.
	    for k, v in namespaces.items():
	        pass

	def process_xml():
	    """Stream-parse ``file_path`` and collect one record per subdocument.

	    An element whose namespace-free tag name equals ``subdocument_tag``
	    opens a new record.  While inside such an element, non-blank element
	    text is stored under the dotted path of tag names (``country.rank``)
	    and attribute values under ``<path>.@<attribute>``
	    (``country.neighbor.@name``).  Values are appended to lists, so
	    repeated elements accumulate in document order.

	    Elements outside a subdocument are ignored.

	    Side effects on module-level state:
	        result     -- receives one ``defaultdict(list)`` per subdocument.
	        namespaces -- maps ``"{uri}"`` to the prefix declared in the file.
	        xml_path   -- stack of currently open tags; emptied on entry and
	                      empty again after a complete parse.
	    """
	    subresult = None

	    # A non-empty stack means "inside a subdocument", so start from a clean
	    # one in case an earlier run was aborted half-way through a document.
	    del xml_path[:]

	    for event, elem in ET.iterparse(file_path, events=("start-ns", "start", "end")):
	        if event == "start-ns":
	            # For this event ``elem`` is a (prefix, uri) tuple.
	            namespaces["{" + elem[1] + "}"] = elem[0]

	        elif event == "start":
	            local_name = remove_namespace(elem.tag)

	            # Compare the local name exactly: a suffix test would also match
	            # unrelated tags such as "subcountry".
	            opens_subdocument = not xml_path and local_name == subdocument_tag
	            if opens_subdocument:
	                subresult = defaultdict(list)

	            # Track the path from the subdocument tag downwards.
	            if opens_subdocument or xml_path:
	                xml_path.append(local_name if remove_namespaces else elem.tag)

	        elif event == "end":
	            # Nothing is open, so this element lies outside any subdocument
	            # and there is no record to write to.
	            if not xml_path:
	                continue

	            key = ".".join(xml_path)

	            if elem.text and elem.text.strip():
	                subresult[key].append(elem.text)

	            for attribute, value in elem.attrib.items():
	                subresult[key + ".@" + attribute].append(value)

	            # Pop the element that just closed; the decision relies on the
	            # stack itself rather than on a tag remembered from a start event.
	            xml_path.pop()

	            if not xml_path:
	                # The subdocument is complete: publish it and release the
	                # parsed subtree to keep memory usage flat.
	                result.append(subresult)
	                subresult = None
	                elem.clear()


	# Run the parser; it fills the module-level ``namespaces`` and ``result``
	process_xml()

	# Print Namespaces
	for k, v in namespaces.items():
	    print(k, " ", v)

	# Print Result, one plain dict per subdocument
	for item in result:
	    print(dict(item))