Skip to content

Instantly share code, notes, and snippets.

@rsirres
Created January 7, 2018 15:44
Show Gist options
  • Save rsirres/f600e1bd698a7c80693f61bd278e7340 to your computer and use it in GitHub Desktop.
Save rsirres/f600e1bd698a7c80693f61bd278e7340 to your computer and use it in GitHub Desktop.
Python 3 script (without dependencies) that parses an XML file incrementally and transforms it into key-value pairs
<?xml version="1.0"?>
<data>
<country name="Liechtenstein">
<rank>1</rank>
<year>2008</year>
<gdppc>141100</gdppc>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank>4</rank>
<year>2011</year>
<gdppc>59900</gdppc>
<neighbor name="Malaysia" direction="N"/>
</country>
<country name="Panama">
<rank>68</rank>
<year>2011</year>
<gdppc>13600</gdppc>
<neighbor name="Costa Rica" direction="W"/>
<neighbor name="Colombia" direction="E"/>
</country>
</data>
import xml.etree.ElementTree as ET
from collections import defaultdict
# Switch: when true, element tags are recorded without their
# "{namespace-uri}" qualifier.
remove_namespaces = True

# Location of the XML document the script reads.
file_path = "/Users/Raphael/Downloads/sample4.xml"

# Stack of tag names leading from the document root down to the element
# that is currently being parsed.
xml_path = list()

# Namespace table filled from "start-ns" events. Keys are the fully
# qualified form "{uri}", values are the declared prefix, e.g.
# "{http://www.w3.org/HTML/1998/html4}" -> "h".
namespaces = dict()

# Output container: dotted element path (plus ".@attr" for attributes)
# mapped to the list of every value seen under that path.
result = defaultdict(list)
### Helper
def remove_namespace(tag):
    """Return *tag* without its leading ``{namespace-uri}`` qualifier.

    ElementTree spells qualified names as ``"{uri}local"``. A tag that
    carries no qualifier is returned unchanged.

    Args:
        tag: Element or attribute name, qualified or not.

    Returns:
        The part of *tag* after the last ``"}"``, or *tag* itself when it
        contains no ``"}"``.
    """
    # Split on the LAST "}": an XML local name can never contain "}", but a
    # namespace URI may, so the final brace is always the real delimiter.
    # rpartition yields ("", "", tag) when no brace is present.
    return tag.rpartition("}")[2]
def shorten_namespace(tag, mapping=None):
    """Replace the ``{namespace-uri}`` qualifier of *tag* with its prefix.

    Args:
        tag: Name in ElementTree notation, e.g. ``"{uri}local"``.
        mapping: Optional table of ``"{uri}"`` -> prefix. Defaults to the
            module-level ``namespaces`` table.

    Returns:
        ``"prefix:local"`` when the qualifier is known, just ``"local"``
        when it is bound to the empty (default) prefix, and *tag*
        unchanged when no entry of the table matches.
    """
    if mapping is None:
        mapping = namespaces
    for qualified, prefix in mapping.items():
        if tag.startswith(qualified):
            local = tag[len(qualified):]
            # An empty prefix denotes the default namespace: no "prefix:".
            return prefix + ":" + local if prefix else local
    return tag
# Stream the document event by event so the whole tree never has to be
# held in memory. "start"/"end" deliver an Element; "start-ns" delivers a
# (prefix, uri) tuple instead.
for event, elem in ET.iterparse(file_path, events=("start-ns", "start", "end")):
    if event == "start":
        # Element opened: push its (optionally unqualified) tag on the path.
        tag = remove_namespace(elem.tag) if remove_namespaces else elem.tag
        xml_path.append(tag)
    elif event == "end":
        # Element closed: its text and attributes are complete now.
        prefix = ".".join(xml_path)
        if elem.text and elem.text.strip():
            result[prefix].append(elem.text)
        for attribute, value in elem.attrib.items():
            # Attribute names can be namespace-qualified too; treat them
            # the same way as element tags so the keys stay consistent.
            if remove_namespaces:
                attribute = remove_namespace(attribute)
            result[prefix + ".@" + attribute].append(value)
        xml_path.pop()
        # Drop the element's text, attributes and children once processed.
        elem.clear()
    elif event == "start-ns":
        # Record the declaration as "{uri}" -> prefix.
        ns_prefix, uri = elem
        namespaces["{" + uri + "}"] = ns_prefix

# Report the namespace declarations that were encountered.
for k, v in namespaces.items():
    print(k, " ", v)

# Report every collected path together with its list of values.
for k, v in result.items():
    print(k, " ", v)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment