Skip to content

Instantly share code, notes, and snippets.

@AlexanderAA
Created September 16, 2018 21:29
Show Gist options
  • Save AlexanderAA/3e9b651cc636e162252fd58fd1e6ec4e to your computer and use it in GitHub Desktop.
Save AlexanderAA/3e9b651cc636e162252fd58fd1e6ec4e to your computer and use it in GitHub Desktop.
import re
# Event kinds emitted by the lexer: an opening tag or a closing tag.
START, END = "start", "end"
def regex_lexer(markup, rgx_tag="<(?P<event>/?)(?P<tag>[A-Z])>"):
    """Tokenize markup into tag events with a regular expression.

    Scans ``markup`` from left to right and lazily yields one
    ``(event, tag)`` pair for every match of ``rgx_tag``. Text that does
    not match the pattern is skipped.

    Usage:

    >>> list(regex_lexer(""))
    []
    >>> list(regex_lexer("<A></B>"))
    [('start', 'A'), ('end', 'B')]

    :param markup: Input text
    :type markup: str
    :param rgx_tag: Regular expression matching a single tag.
        Must contain the named capturing groups "event" and "tag".
    :type rgx_tag: str
    :return: Generator of (event, tag) pairs, where event is END when the
        "event" group captured "/" and START otherwise.
    :rtype: [(str, str)]
    """
    for found in re.finditer(rgx_tag, markup):
        name = found.group('tag')
        # A captured slash marks a closing tag; anything else is an opening tag.
        if found.group('event') == '/':
            kind = END
        else:
            kind = START
        yield kind, name
def tag_checker(markup, lexer=regex_lexer):
    r"""Check that the tags in ``markup`` are correctly nested.

    Walks the ``(event, tag)`` pairs produced by ``lexer`` while keeping a
    stack of currently open tags, and stops at the first inconsistency.
    In the messages, ``#`` stands for "nothing": either there was no open
    tag to close, or the input ended while a tag was still open.

    The docstring is a raw string so that the backslashes in the examples
    below reach doctest unchanged instead of being read as escape
    sequences.

    :param markup: Input text
    :type markup: str
    :param lexer: Lexer callable; called with ``markup`` and expected to
        return an iterable of (event, tag) pairs using START / END events.
    :return: Error or Success message
    :rtype: str

    Usage:

    >>> tag_checker("")
    'Correctly tagged paragraph'
    >>> tag_checker(" ")
    'Correctly tagged paragraph'
    >>> tag_checker("<A><B></B></A>")
    'Correctly tagged paragraph'
    >>> tag_checker("<A><B></B>")
    'Expected </A> found #'
    >>> tag_checker("<A><B></A>")
    'Expected </B> found </A>'
    >>> tag_checker("The following text<C><B>is centred and in boldface</B></C>")
    'Correctly tagged paragraph'
    >>> tag_checker(r"<B>This <\g>is <B>boldface</B> in <<*> a</B> <\6> <<d>sentence")
    'Correctly tagged paragraph'
    >>> tag_checker("<B><C> This should be centred and in boldface, but the tags are wrongly nested </B></C>")
    'Expected </C> found </B>'
    >>> tag_checker("<B>This should be in boldface, but there is an extra closing tag</B></C>")
    'Expected # found </C>'
    >>> tag_checker("<B><C>This should be centred and in boldface, but there is a missing closing tag</C>")
    'Expected </B> found #'
    """
    open_tags = []
    for event, tag in lexer(markup):
        if event == START:
            open_tags.append(tag)
        elif event == END:
            if not open_tags:
                # Closing tag with nothing open.
                return "Expected # found </{}>".format(tag)
            expected_tag = open_tags.pop()
            if tag != expected_tag:
                # Closing tag does not match the innermost open tag.
                return "Expected </{}> found </{}>".format(expected_tag, tag)
    if open_tags:
        # Input ended first; report the innermost tag still open.
        return "Expected </{}> found #".format(open_tags[-1])
    return "Correctly tagged paragraph"
if __name__ == "__main__":
    # Executed as a script: run the usage examples embedded in the docstrings.
    import doctest

    doctest.testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment