# AlexanderAA/tagchecker.py

## tagchecker.py
import re


# Event kinds produced by the lexer: an opening tag and a closing tag.
START, END = 'start', 'end'


def regex_lexer(markup, rgx_tag="<(?P<event>/?)(?P<tag>[A-Z])>"):
    """ Tokenize markup into a stream of tag events using a regular expression.

    Only substrings matching ``rgx_tag`` produce output; all other text
    is skipped.

    Usage:
    >>> list(regex_lexer(""))
    []

    >>> list(regex_lexer("<A></B>"))
    [('start', 'A'), ('end', 'B')]

    :param markup: Input text
    :type markup: str
    :param rgx_tag: Regular expression to match a single tag.
                    Must contain named capturing groups "event" and "tag".
    :type rgx_tag: str
    :return: Generator of (event, tag) pairs in order of appearance
    :rtype: [(str, str)]
    """
    for found in re.finditer(rgx_tag, markup):
        name = found.group('tag')
        # A slash captured by the "event" group marks a closing tag;
        # anything else is treated as an opening tag.
        if found.group('event') == '/':
            yield END, name
        else:
            yield START, name


def tag_checker(markup, lexer=regex_lexer):
    r""" Check markup tags correctness, return success or error message.

    Opening tags are pushed onto a stack and every closing tag must match
    the most recently opened tag that is still unclosed. Checking stops at
    the first problem found. In the messages, ``#`` stands for "no tag":
    either nothing was open when a closing tag appeared, or the input ended
    while a tag was still open.

    The docstring is a raw string so that the backslashes in the examples
    below are kept literally instead of being read as escape sequences.

    :param markup: Input text
    :type markup: str
    :param lexer: Lexer callable; called with ``markup`` and expected to
                  yield (event, tag) pairs
    :return: Error or Success message
    :rtype: str

    Usage:
    >>> tag_checker("")
    'Correctly tagged paragraph'

    >>> tag_checker(" ")
    'Correctly tagged paragraph'

    >>> tag_checker("<A><B></B></A>")
    'Correctly tagged paragraph'

    >>> tag_checker("<A><B></B>")
    'Expected </A> found #'

    >>> tag_checker("<A><B></A>")
    'Expected </B> found </A>'

    >>> tag_checker("The following text<C><B>is centred and in boldface</B></C>")
    'Correctly tagged paragraph'

    >>> tag_checker(r"<B>This <\g>is <B>boldface</B> in <<*> a</B> <\6> <<d>sentence")
    'Correctly tagged paragraph'

    >>> tag_checker("<B><C> This should be centred and in boldface, but the tags are wrongly nested </B></C>")
    'Expected </C> found </B>'

    >>> tag_checker("<B>This should be in boldface, but there is an extra closing tag</B></C>")
    'Expected # found </C>'

    >>> tag_checker("<B><C>This should be centred and in boldface, but there is a missing closing tag</C>")
    'Expected </B> found #'

    """
    stack = []  # tags opened so far and not yet closed, innermost last
    for event, tag in lexer(markup):
        if event == START:
            stack.append(tag)
        elif event == END:
            if not stack:
                # Closing tag with nothing open.
                return "Expected # found </{}>".format(tag)
            expected_tag = stack.pop()
            if tag != expected_tag:
                return "Expected </{}> found </{}>".format(expected_tag, tag)
        # Events that are neither START nor END are ignored.
    if stack:
        # Input ended while the innermost tag was still open.
        return "Expected </{}> found #".format(stack[-1])
    return "Correctly tagged paragraph"


if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod
    testmod()
	import re


	# Event kinds produced by the lexer: an opening tag and a closing tag.
	START, END = 'start', 'end'


	def regex_lexer(markup, rgx_tag="<(?P<event>/?)(?P<tag>[A-Z])>"):
	    """ Regex-based lexer.

	    Scans ``markup`` for substrings matching ``rgx_tag`` and yields one
	    (event, tag) pair per match; text that does not match is skipped.

	    Usage:
	    >>> list(regex_lexer(""))
	    []

	    >>> list(regex_lexer("<A></B>"))
	    [('start', 'A'), ('end', 'B')]

	    :param markup: Input text
	    :type markup: str
	    :param rgx_tag: Regular expression to match a single tag.
	                    Must contain named capturing groups "event" and "tag".
	    :type rgx_tag: str
	    :return: Generator of (event, tag) pairs in order of appearance
	    :rtype: [(str, str)]
	    """
	    matches = re.finditer(rgx_tag, markup)
	    for match in matches:
	        tag = match.group('tag')
	        # A slash captured by the "event" group marks a closing tag.
	        event = END if match.group('event') == '/' else START
	        yield event, tag


	def tag_checker(markup, lexer=regex_lexer):
	    r""" Check markup tags correctness, return success or error message.

	    Opening tags are pushed onto a stack and every closing tag must match
	    the most recently opened tag that is still unclosed. Checking stops at
	    the first problem found. In the messages, ``#`` stands for "no tag":
	    either nothing was open when a closing tag appeared, or the input
	    ended while a tag was still open.

	    The docstring is a raw string so that the backslashes in the examples
	    below are kept literally instead of being read as escape sequences.

	    :param markup: Input text
	    :type markup: str
	    :param lexer: Lexer callable; called with ``markup`` and expected to
	                  yield (event, tag) pairs
	    :return: Error or Success message
	    :rtype: str

	    Usage:
	    >>> tag_checker("")
	    'Correctly tagged paragraph'

	    >>> tag_checker(" ")
	    'Correctly tagged paragraph'

	    >>> tag_checker("<A><B></B></A>")
	    'Correctly tagged paragraph'

	    >>> tag_checker("<A><B></B>")
	    'Expected </A> found #'

	    >>> tag_checker("<A><B></A>")
	    'Expected </B> found </A>'

	    >>> tag_checker("The following text<C><B>is centred and in boldface</B></C>")
	    'Correctly tagged paragraph'

	    >>> tag_checker(r"<B>This <\g>is <B>boldface</B> in <<*> a</B> <\6> <<d>sentence")
	    'Correctly tagged paragraph'

	    >>> tag_checker("<B><C> This should be centred and in boldface, but the tags are wrongly nested </B></C>")
	    'Expected </C> found </B>'

	    >>> tag_checker("<B>This should be in boldface, but there is an extra closing tag</B></C>")
	    'Expected # found </C>'

	    >>> tag_checker("<B><C>This should be centred and in boldface, but there is a missing closing tag</C>")
	    'Expected </B> found #'

	    """
	    lexemes = lexer(markup)
	    stack = []  # tags opened so far and not yet closed, innermost last
	    for (event, tag) in lexemes:
	        if event == START:
	            stack.append(tag)
	        elif event == END:
	            if not stack:
	                # Closing tag with nothing open.
	                return "Expected # found </{}>".format(tag)
	            expected_tag = stack.pop()
	            if tag != expected_tag:
	                return "Expected </{}> found </{}>".format(expected_tag, tag)
	    if stack:
	        # Input ended while the innermost tag was still open.
	        expected_tag = stack.pop()
	        return "Expected </{}> found #".format(expected_tag)
	    return "Correctly tagged paragraph"


	if __name__ == "__main__":
	    # Executed as a script: run the doctests embedded in this module.
	    import doctest
	    doctest.testmod()