Skip to content

Instantly share code, notes, and snippets.

@kaleidoescape
Last active March 19, 2021 13:57
Show Gist options
  • Save kaleidoescape/524f6f53a4562eaf6d8f1463f4d54670 to your computer and use it in GitHub Desktop.
Save kaleidoescape/524f6f53a4562eaf6d8f1463f4d54670 to your computer and use it in GitHub Desktop.
A regex to capture XML tags from a string. Useful for those times when accessing the DOM or using a full XML parser is not feasible (e.g. dirty scraped web data).
import re
# Matches one standalone XML/HTML-style tag together with any whitespace
# directly around it. The single capture group spans the whole match, so
# findall() returns the tag text including that surrounding whitespace.
#   \s*               optional whitespace before the tag
#   <(?:[A-Za-z]+|/)  "<" followed by one or more ASCII letters (opening or
#                     self-closing tag) or by "/" (closing tag); text such as
#                     "<3" or "< 5" therefore does not start a match
#   [^<]*?>           as few non-"<" characters as possible, up to the
#                     first ">" (non-greedy)
#   \s*               optional whitespace after the tag
STANDALONE_TAG_REGEX = re.compile(r'(\s*<(?:[A-Za-z]+|/)[^<]*?>\s*)')
#a dictionary where keys are strings with fake xml tags in them,
#and values are the ordered list of tags that the regex should capture
REGEX_TESTS = {
#wrapped
'hello <a>world</a>!': [' <a>', '</a>'],
'hello<a>world</a>': ['<a>', '</a>'],
'<a>hello world</a>': ['<a>', '</a>'],
'hello <a>world</a> out there': [' <a>', '</a> '],
'hello <a>world</a>': [' <a>', '</a>'],
'hello <a href="blah">world</a>': [' <a href="blah">', '</a>'],
#nested
'hello <a><b>world</b></a>': [' <a>', '<b>', '</b>', '</a>'],
'hello <a><b href="blah">world</b></a>': [' <a>', '<b href="blah">', '</b>', '</a>'],
'hello <a href="blah"><b>world</b></a>': [' <a href="blah">', '<b>', '</b>', '</a>'],
'hello <a><a>world</a></a>': [' <a>', '<a>', '</a>', '</a>'],
'<a><b>word1</b>word2</a> word3 word4': ['<a>', '<b>', '</b>', '</a> '],
'hello <a><b><a>world</a></b></a>': [' <a>', '<b>', '<a>', '</a>', '</b>', '</a>'],
'hello <a>world</a> hello <b>everyone</b>': [' <a>', '</a> ', ' <b>', '</b>'],
'hello <a></a>world': [' <a>', '</a>'],
'hello <a href="blah"><b href="blah">world</b></a>':
[' <a href="blah">', '<b href="blah">', '</b>', '</a>'],
#less than / greater than
'hello <3 <a>world</a>': [' <a>', '</a>'],
'hello <a>world</a> <3': [' <a>', '</a> '],
'hello E> <a>world</a>': [' <a>', '</a>'],
'hello <a>world</a> E>': [' <a>', '</a> '],
'hello <3 <a> world E> </a>': [' <a> ', ' </a>'],
'hello <a> <3 world </a> E>': [' <a> ', ' </a> '],
'hello <a> world </a> E>': [' <a> ', ' </a> '],
'hello <a> world E> </a>': [' <a> ', ' </a>'],
'hello <a> <3 world </a>': [' <a> ', ' </a>'],
'hello <a> <3 world E> </a>': [' <a> ', ' </a>'],
#empty
'hello <a><b></b></a>world': [' <a>', '<b>', '</b>', '</a>'],
'hello <a><a></a></a>world': [' <a>', '<a>', '</a>', '</a>'],
#self-closing
'hello <br /> world': [' <br /> '],
'hello <br /><a>world</a>': [' <br />', '<a>', '</a>'],
'hello<br/>world': ['<br/>'],
#realistic
'<a>Shelf</a>life at <b>28</b><br />°C:<br />':
['<a>', '</a>', ' <b>', '</b>', '<br />', '<br />'],
'< 5 LEU/µL<x id=“locked568” xid=“lockTU_e1354968-d068-4cca-8203-7ef71227fad1"/> <x id=“1432”/>5-20 LEU/µl borderline':
['<x id=“locked568” xid=“lockTU_e1354968-d068-4cca-8203-7ef71227fad1"/> ', '<x id=“1432”/>'],
}
if __name__ == '__main__':
    # Self-check: run every sample string through the tag regex and print,
    # per sample, whether the captured tags equal the expected list
    # (comparison is order- and whitespace-sensitive).
    for s, answer in REGEX_TESTS.items():
        # The pattern is already compiled, so call its own findall(); with a
        # single capture group this returns the list of captured strings.
        result = STANDALONE_TAG_REGEX.findall(s)
        if result != answer:
            print('FAILURE.\nString: {}\nResult: {}\nExpected: {}\n'.format(s, result, answer))
        else:
            print('Success.\nString: {}\nResult: {}\n'.format(s, result))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment