Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Robust XML parser for duplicated attributes.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Example robust XML parser capable of parsing duplicated attributes using html.parser.HTMLParser
Author: Tanimodori
License: Public Domain
Reference: https://stackoverflow.com/questions/31909929/webscraping-with-python3-ignoring-duplicate-attribute-errors
"""
import re
import sys
import html.parser as parser_module
from html.parser import HTMLParser
# Workarounds for malformed tags such as <_Transform ...>.
# Each pattern below is a copy of an html.parser module-level pattern with a
# '_' added inside the leading '[a-zA-Z]' class, so that tag names may start
# with an underscore.
# NOTE: these assignments rebind attributes of the html.parser module itself,
# so they apply to every HTMLParser user in the process, not only to the
# parser defined in this file.
# NOTE(review): the patterns were copied from one specific html.parser
# release; confirm they still correspond to the installed Python version.

# Detects the start of an opening tag: '<' followed by a name-start character.
parser_module.starttagopen = re.compile('<[_a-zA-Z]')

# Captures the tag name (group 1) and swallows trailing whitespace / stray '/'.
parser_module.tagfind_tolerant = re.compile(
    r'([_a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')

# Locates the end of a start tag, skipping over its whole attribute list.
# Compiled with re.VERBOSE, so whitespace and '#' comments inside the literal
# are not part of the pattern.
parser_module.locatestarttagend_tolerant = re.compile(r"""
<[_a-zA-Z][^\t\n\r\f />\x00]* # tag name
(?:[\s/]* # optional whitespace before attribute name
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
(?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
(?:\s*,)* # possibly followed by a comma
)?(?:\s|/(?!>))*
)*
)?
\s* # trailing whitespace
""", re.VERBOSE)

# Strict end-tag pattern with the same '_' allowance; group 1 is the tag name.
# It is bound on html.parser as well, like the patterns above: a name that
# exists only in this module is never seen by the parser code. The local name
# is kept so existing references to `endtagfind` continue to work.
endtagfind = parser_module.endtagfind = re.compile(
    r'</\s*([_a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
class RobustXMLList:
    """One node of the parsed tree: a tag name, its attributes, its children.

    Attributes are stored in a plain dict, so when the same attribute name
    occurs more than once the value seen last is the one that is kept.
    """

    def __init__(self, tag=None, attrs=None):
        """Create a node.

        Args:
            tag: Name of the element, or None.
            attrs: Optional iterable of (name, value) pairs. A falsy value
                (None, empty list) yields an empty attribute dict.
        """
        super().__init__()
        self.tag = tag
        # Later pairs overwrite earlier ones carrying the same name.
        self.attrs = {name: value for name, value in attrs} if attrs else {}
        # Children in document order; filled in by whoever builds the tree.
        self.items = []

    def __str__(self):
        """Return the string form of the instance attribute dict."""
        return str(vars(self))

    def __repr__(self):
        """Return the same text as __str__, so nested nodes print in full."""
        return str(self)
class RobustXMLParser(HTMLParser):
    """Build a tree of RobustXMLList nodes using HTMLParser's tolerant parsing.

    The tree hangs off a synthetic root node tagged '_root'. Elements become
    RobustXMLList nodes and text runs become plain strings inside the
    ``items`` list of the element that is open when they are encountered.
    """

    def __init__(self, **kwargs):
        """Create a parser with an empty tree.

        Args:
            **kwargs: Passed through unchanged to HTMLParser.__init__.
        """
        # Synthetic root that collects the document's top-level nodes.
        self._root = RobustXMLList('_root')
        # Stack of currently open elements; the last entry receives children.
        self._stack = [self._root]
        super().__init__(**kwargs)

    def handle_starttag(self, tag, attrs):
        """Open a new element as a child of the current one.

        Args:
            tag: Tag name as reported by HTMLParser.
            attrs: List of (name, value) pairs as reported by HTMLParser.
        """
        node = RobustXMLList(tag, attrs)
        self._stack[-1].items.append(node)
        self._stack.append(node)

    def handle_endtag(self, tag):
        """Close the innermost open element.

        The tag name is not compared with the open element. A closing tag
        that arrives when only the root is left is ignored: popping the root
        would leave the stack empty and make the next event fail with
        IndexError.
        """
        if len(self._stack) > 1:
            self._stack.pop()

    def handle_data(self, data):
        """Attach a text run to the current element.

        Text consisting only of spaces, tabs and newlines is dropped.
        """
        if data.strip(" \t\n"):
            # append, not extend: extending a list with a str would add every
            # character as a separate item.
            self._stack[-1].items.append(data)

    def close(self):
        """Finish parsing and return the root node.

        Returns:
            The '_root' RobustXMLList holding the parsed tree.
        """
        # Let HTMLParser process any input it is still buffering (for example
        # trailing text) before the tree is handed out.
        super().close()
        return self._root

    @classmethod
    def parse(cls, document):
        """Parse a whole document held in a string.

        Args:
            document: The markup to parse.

        Returns:
            The '_root' RobustXMLList holding the parsed tree.
        """
        parser = cls()
        # Workaround for <!-- ... --> in attribute list: remove every comment
        # (re.DOTALL lets a comment span lines) before feeding the parser.
        parser.feed(re.sub("<!--.*?-->", '', document, flags=re.DOTALL))
        return parser.close()
if __name__ == "__main__":
    # Command-line entry point: parse the file named by the first argument
    # and print the resulting tree.
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaces as a bare IndexError.
        sys.exit("usage: {} FILE".format(sys.argv[0]))
    with open(sys.argv[1], "r") as f:
        # Parse XML to XMLList
        XMLList = RobustXMLParser.parse(f.read())
    # The file is fully read at this point, so the handle is released before
    # the remaining steps.
    # convert to Python dict
    XMLDict = XMLList.__dict__
    # string output
    print(XMLList)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment