Last active
November 9, 2019 17:24
-
-
Save Tanimodori/c154d8c7325208e0020dfb9207a69914 to your computer and use it in GitHub Desktop.
Robust XML parser for duplicated attributes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
""" | |
Example robust XML parser that is capable of parsing duplicated attributes using html.parser.HTMLParser
Author: Tanimodori | |
License: Public Domain | |
Reference: https://stackoverflow.com/questions/31909929/webscraping-with-python3-ignoring-duplicate-attribute-errors | |
""" | |
import re | |
import sys | |
import html.parser as parser_module | |
from html.parser import HTMLParser | |
# Workarounds for malformed tag <_Transform ...>
# Adds a '_' inside '[a-zA-Z]'
#
# The three assignments below rebind compiled patterns on the imported
# html.parser module object (parser_module) rather than on a parser instance,
# so the replacement is visible to everything in this process that uses
# html.parser, not only to the classes defined in this file.
# NOTE(review): the patterns look like copies of the stdlib originals with
# only the leading character class widened to accept '_' -- confirm against
# the html.parser source of the Python version in use.

# Pattern recognising the opening of a start tag: '<' followed by a letter
# or an underscore.
parser_module.starttagopen = re.compile('<[_a-zA-Z]')
# Pattern capturing a tag name (group 1) and consuming the whitespace or
# non-closing slashes that follow it.
parser_module.tagfind_tolerant = re.compile(
    r'([_a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# Verbose-mode pattern spanning a whole start tag up to, but not including,
# the closing '>': tag name, then any number of attributes whose values may
# be single-quoted, double-quoted or bare.
parser_module.locatestarttagend_tolerant = re.compile(r"""
<[_a-zA-Z][^\t\n\r\f />\x00]* # tag name
(?:[\s/]* # optional whitespace before attribute name
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
(?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
(?:\s*,)* # possibly followed by a comma
)?(?:\s|/(?!>))*
)*
)?
\s* # trailing whitespace
""", re.VERBOSE)
# End-tag pattern that also accepts a leading '_' in the tag name.
# NOTE(review): unlike the three patterns above, this one is bound to a name
# in THIS module and is not assigned onto parser_module; nothing in the
# visible code reads it. Confirm whether `parser_module.endtagfind` was
# intended.
endtagfind = re.compile(r'</\s*([_a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
class RobustXMLList:
    """Tree node holding a tag name, its attributes and its child items.

    ``attrs`` is accepted as an iterable of ``(name, value)`` pairs and is
    stored in a plain dict, so when a name occurs more than once the value
    seen last is the one that is kept.
    """

    def __init__(self, tag=None, attrs=None):
        """Create a node.

        Args:
            tag: Tag name stored on the node (``None`` by default).
            attrs: Optional iterable of ``(name, value)`` pairs; a falsy
                value results in an empty attribute dict.
        """
        super().__init__()
        # The assignment order (tag, attrs, items) fixes the key order of
        # ``__dict__`` and therefore the text returned by ``__str__``.
        self.tag = tag
        self.attrs = {name: value for name, value in attrs} if attrs else {}
        # Children, in insertion order.
        self.items = []

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()

    def __str__(self):
        """Return the string form of the instance attribute dict."""
        return str(vars(self))
class RobustXMLParser(HTMLParser):
    """Lenient XML-to-tree parser built on ``html.parser.HTMLParser``.

    Start tags become ``RobustXMLList`` nodes nested under the node that is
    currently open; non-blank text is stored as a string child of that node.
    The finished tree hangs below a synthetic node whose tag is ``'_root'``.
    """

    def __init__(self, **kwargs):
        """Create a parser; ``kwargs`` are forwarded to ``HTMLParser``."""
        self._root = RobustXMLList('_root')
        # Stack of currently open nodes; the last entry receives new children.
        self._stack = [self._root]
        super().__init__(**kwargs)

    def handle_starttag(self, tag, attrs):
        """Attach a new node to the open node and make it the open node."""
        node = RobustXMLList(tag, attrs)
        self._stack[-1].items.append(node)
        self._stack.append(node)

    def handle_endtag(self, tag):
        """Close the innermost open node.

        The synthetic root is never popped: a stray closing tag would
        otherwise empty the stack and make the next callback fail with
        ``IndexError``.
        """
        if len(self._stack) > 1:
            self._stack.pop()

    def handle_data(self, data):
        """Store a text run as one child unless it is only blanks."""
        if data.strip(" \t\n"):
            # append, not extend: extending a list with a str would add
            # every character as a separate child.
            self._stack[-1].items.append(data)

    def close(self):
        """Flush buffered input and return the root node of the tree.

        Returns:
            The ``'_root'`` node that holds the parsed document.
        """
        # The base implementation forces processing of any data still
        # buffered by feed(); skipping it can drop trailing input.
        super().close()
        return self._root

    @classmethod
    def parse(cls, document):
        """Parse ``document`` (a str) and return the root node.

        Args:
            document: Full text of the document to parse.

        Returns:
            The ``'_root'`` node of the resulting tree.
        """
        parser = cls()
        # Workaround for <!-- ... --> in attribute list
        without_comments = re.sub("<!--.*?-->", '', document, flags=re.DOTALL)
        parser.feed(without_comments)
        return parser.close()
if __name__ == "__main__":
    # Command-line entry point: parse the file named by the first argument
    # and print the resulting tree.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: {} <xml-file>".format(sys.argv[0]))
    with open(sys.argv[1], "r") as f:
        # Parse XML to XMLList
        XMLList = RobustXMLParser.parse(f.read())
    # convert to Python dict (kept as a demonstration; not used below)
    XMLDict = XMLList.__dict__
    # string output
    print(XMLList)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment