Skip to content

Instantly share code, notes, and snippets.

@mgd020
Last active December 2, 2019 05:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mgd020/854fab4c96a66ef63c5197528b017391 to your computer and use it in GitHub Desktop.
Save mgd020/854fab4c96a66ef63c5197528b017391 to your computer and use it in GitHub Desktop.
Parse an XML snippet with regular expressions only.
import re
from xml.sax.saxutils import unescape
# Tokenizer for the markup constructs the parser recognises. Alternatives, in order:
#   close   - an end tag such as </name> (whitespace before ">" is allowed)
#   open    - a start tag, its raw attribute text, and an optional "/" marking an empty element
#   comment - <!-- ... -->
#   cdata   - only the "<![CDATA[" opener; the terminator is found with cdata_close_pattern
# The tag name in "close" excludes ">" so that adjacent end tags ("</b></a>") are
# matched one at a time instead of being swallowed as a single name.
# DOTALL lets comments and attribute lists span several lines.
tags_pattern = re.compile(
    r"</(?P<close>[^>\s]+)\s*>"
    r"|<(?P<open>[^>!?/\s]+)(\s+(?P<attrs>.*?))?(?P<empty>/)?\s*>"
    r"|<!--(?P<comment>.*?)-->"
    r"|(?P<cdata><!\[CDATA\[)",
    re.DOTALL,
)

# One name="value" / name='value' pair. The name excludes "=" so it cannot absorb
# part of a quoted value; the closing quote must be the same character as the opening one.
attrs_pattern = re.compile(
    r"\s*(?P<name>[^\s=]+)\s*=\s*(?P<quote>\"|')(?P<value>.*?)(?P=quote)",
    re.DOTALL,
)

# Terminator of a CDATA section.
cdata_close_pattern = re.compile(r"\]\]>")

# Extra entities passed to xml.sax.saxutils.unescape in addition to its built-in ones.
escaped_entities = {"&quot;": '"', "&apos;": "'"}
def xml_sax_parse_string(string):
    """Yield SAX-style events for an XML snippet, tokenised with regular expressions.

    Each event is a tuple whose first item names the event:

    * ``("startElement", name, attrs)`` - ``attrs`` is a dict mapping attribute
      names to their unescaped values.
    * ``("endElement", name)`` - for end tags, and immediately after the
      ``startElement`` of an empty element such as ``<br/>``.
    * ``("characters", text)`` - text between tags (entities unescaped) or the
      literal content of a CDATA section.

    Comments are consumed without producing an event. No well-formedness
    checks are performed; start and end tags are reported as they appear.
    """
    start = 0
    while True:
        match = tags_pattern.search(string, start)
        if match is None:
            break

        # NOTE(review): text preceding the very first tag is skipped because
        # ``start`` is still 0 there. This also hides an XML prolog, so it is
        # kept as-is; confirm whether leading text should be reported.
        if start and match.start() != start:
            yield "characters", unescape(string[start:match.start()], escaped_entities)

        if match.group("cdata") is not None:
            start = match.end()
            match = cdata_close_pattern.search(string, start)
            if match is None:
                # Unterminated CDATA: stop tokenising. Whatever follows the
                # opener is reported by the trailing-text step below.
                break
            # CDATA content is literal, so entity references inside it must
            # not be unescaped.
            yield "characters", string[start:match.start()]
        elif match.group("open") is not None:
            name = match.group("open")
            attrs = {
                attr_name: unescape(value, escaped_entities)
                for attr_name, _, value in attrs_pattern.findall(match.group("attrs") or "")
            }
            yield "startElement", name, attrs
            if match.group("empty"):
                yield "endElement", name
        elif match.group("close") is not None:
            yield "endElement", match.group("close")

        start = match.end()

    # Report any text left after the last token, including a single trailing
    # character.
    if start < len(string):
        yield "characters", unescape(string[start:], escaped_entities)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment