"""Self-test for a regex that captures standalone XML-like tags.

Running this file as a script checks STANDALONE_TAG_REGEX against every
entry of REGEX_TESTS and prints one report per test string.
"""
import re


# Match one tag plus any whitespace on either side of it.  Right after the
# '<' the tag must have one or more ASCII letters or a single '/'; it then
# continues lazily over any characters except '<' up to the first '>'.
# Requiring a letter or '/' after '<' keeps text such as '<3' or '< 5' from
# being treated as a tag.
STANDALONE_TAG_REGEX = re.compile(r'(\s*<(?:[A-Za-z]+|/)[^<]*?>\s*)')


# Maps a string containing fake XML tags to the ordered list of substrings
# (tags with their adjacent whitespace) that the regex should capture.
REGEX_TESTS = {
    # wrapped
    'hello <a>world</a>!': [' <a>', '</a>'],
    'hello<a>world</a>': ['<a>', '</a>'],
    '<a>hello world</a>': ['<a>', '</a>'],
    'hello <a>world</a> out there': [' <a>', '</a> '],
    'hello <a>world</a>': [' <a>', '</a>'],
    'hello <a href="blah">world</a>': [' <a href="blah">', '</a>'],

    # nested
    'hello <a><b>world</b></a>': [' <a>', '<b>', '</b>', '</a>'],
    'hello <a><b href="blah">world</b></a>': [' <a>', '<b href="blah">', '</b>', '</a>'],
    'hello <a href="blah"><b>world</b></a>': [' <a href="blah">', '<b>', '</b>', '</a>'],
    'hello <a><a>world</a></a>': [' <a>', '<a>', '</a>', '</a>'],
    '<a><b>word1</b>word2</a> word3 word4': ['<a>', '<b>', '</b>', '</a> '],
    'hello <a><b><a>world</a></b></a>': [' <a>', '<b>', '<a>', '</a>', '</b>', '</a>'],
    'hello <a>world</a> hello <b>everyone</b>': [' <a>', '</a> ', ' <b>', '</b>'],
    'hello <a></a>world': [' <a>', '</a>'],
    'hello <a href="blah"><b href="blah">world</b></a>':
        [' <a href="blah">', '<b href="blah">', '</b>', '</a>'],

    # less than / greater than
    'hello <3 <a>world</a>': [' <a>', '</a>'],
    'hello <a>world</a> <3': [' <a>', '</a> '],
    'hello E> <a>world</a>': [' <a>', '</a>'],
    'hello <a>world</a> E>': [' <a>', '</a> '],
    'hello <3 <a> world E> </a>': [' <a> ', ' </a>'],
    'hello <a> <3 world </a> E>': [' <a> ', ' </a> '],
    'hello <a> world </a> E>': [' <a> ', ' </a> '],
    'hello <a> world E> </a>': [' <a> ', ' </a>'],
    'hello <a> <3 world </a>': [' <a> ', ' </a>'],
    'hello <a> <3 world E> </a>': [' <a> ', ' </a>'],

    # empty
    'hello <a><b></b></a>world': [' <a>', '<b>', '</b>', '</a>'],
    'hello <a><a></a></a>world': [' <a>', '<a>', '</a>', '</a>'],

    # self-closing
    'hello <br /> world': [' <br /> '],
    'hello <br /><a>world</a>': [' <br />', '<a>', '</a>'],
    'hello<br/>world': ['<br/>'],

    # realistic
    '<a>Shelf</a>life at <b>28</b><br />°C:<br />':
        ['<a>', '</a>', ' <b>', '</b>', '<br />', '<br />'],
    '< 5 LEU/µL<x id=“locked568” xid=“lockTU_e1354968-d068-4cca-8203-7ef71227fad1"/> <x id=“1432”/>5-20 LEU/µl borderline':
        ['<x id=“locked568” xid=“lockTU_e1354968-d068-4cca-8203-7ef71227fad1"/> ', '<x id=“1432”/>'],
}


def run_regex_tests():
    """Check STANDALONE_TAG_REGEX against every entry of REGEX_TESTS.

    Prints a 'Success.' or 'FAILURE.' report for each test string, in the
    order the entries appear in REGEX_TESTS.

    Returns:
        The number of test strings whose captured tags differ from the
        expected list.
    """
    failures = 0
    for s, answer in REGEX_TESTS.items():
        result = STANDALONE_TAG_REGEX.findall(s)
        if result != answer:
            failures += 1
            print('FAILURE.\nString: {}\nResult: {}\nExpected: {}\n'.format(s, result, answer))
        else:
            print('Success.\nString: {}\nResult: {}\n'.format(s, result))
    return failures


if __name__ == '__main__':
    run_regex_tests()