# jvanasco/wellformed_parser.py

## wellformed_parser.py
def extract_tag_inner(tag):
    """
    extracts the inner part of a tag - dropping brackets, tagname, trailing slash

    :arg string tag: a html tag of the formats:
        <TAG_NAME TAG_ATTRIBUTES>
        <TAG_NAME TAG_ATTRIBUTES/>
        <TAG_NAME TAG_ATTRIBUTES />

    :returns string: the attribute text of the tag.
                     an empty string is returned when the tag has no attributes (e.g. "<img/>")

    :raises ValueError: if ``tag`` is empty, does not start with `<` or does not end with `>`
    """
    # slices (not indexes) are compared, so an empty string raises
    # ValueError below instead of an IndexError
    if (tag[:1] != '<') or (tag[-1:] != '>'):
        raise ValueError("invalid tag")
    # remove the tag brackets
    tag = tag[1:-1].strip()
    # remove the self-closing slash. only the LAST character is dropped here;
    # the first character belongs to the tag name and must be kept so the
    # split below still finds the correct name/attributes boundary
    if tag.endswith('/'):
        tag = tag[:-1].strip()

    # remove the tag name, which is everything up to the first space.
    # `partition` yields '' as the remainder on a tag without attributes
    return tag.partition(' ')[2]


def tag_inner_parser(txt):
    """
    this parser expects/requires WELL FORMED html attributes:

    :arg string txt: Text to be parsed. This should be cleaned

    :returns dict: a dict of values for the attributes.
                   if an attribute does not have a value (i.e. it is defined by its mere presence) the value will be `True`
                   otherwise, values will always be a string (possibly an empty string, e.g. `alt=""`)

    :raises ValueError: if the character following a `=` is not a single or double quote

    The text should be cleaned beforehand to not contain trailing whitespace or slash

    Notes on quoting:
        * a quote character preceded by a backslash does not close the value
        * in single-quoted values, `\\'` is unescaped to `'`;
          in double-quoted values the backslash is kept as-is
        * if the text ends before a value is closed, the key is recorded as `True`

    See ``extract_tag_inner`` for a function to extract the inner from a full tag.
    """
    kw = {}  # kwargs dict
    n_k = False  # iN_Key
    n_v = False  # iN_Value
    k = None  # active Key being parsed
    v = None  # active Value being parsed; `None` until the opening quote is seen
    q = None  # quote character that opened the active value
    _qs = ('"', "'", )  # doing this is faring better than a direct comparison
    for c in txt:  # Char in text
        if n_k:
            # parsing a KEY
            if c == ' ':
                # SPACE means we're on an attribute that does not have a value.
                # encode this as TRUE
                kw[k] = True
                # RESET
                n_k = False
                # no need to reset other vars, the next loop will handle it
            elif c == '=':
                # `=` means we're ending the KEY and going to search a value
                # this is a SWITCH
                n_k = False
                n_v = True
                v = None
                q = None
            else:
                # assume this char is valid and keep building the KEY
                k += c
        elif n_v:
            # parsing a VALUE
            if v is None:
                # we just got here!
                # this parser requires VALUE attributes to be single or double quoted
                if c not in _qs:  # cPython is benching faster with this as a var
                    raise ValueError("invalid quote")
                # note the quote character. it will be used to check for escaped quotes
                q = c
                v = ''
            elif c != q:
                # normal character, build out the value
                v += c
            elif v.endswith('\\'):
                # we hit an escaped quote character, treat it as normal.
                # `endswith` is used because the value may still be empty
                # here (e.g. `alt=""`), and indexing `v[-1]` would fail
                v += c
            else:
                # FINI! we hit the closing quote
                if q == "'":
                    # unescape quoted chars
                    v = v.replace("\\'", "'")
                # SET
                kw[k] = v
                # RESET
                k = None
                v = None
                n_k = False
                n_v = False
                q = None
        else:  # not n_k and not n_v:
            if c == ' ':
                continue
            # first character of a new KEY
            n_k = True
            k = c
    if k:
        # if we're here without clearing a value, then we ended on an attribute that does not have a value
        kw[k] = True
    return kw


def tag_inner_parser_unknown(txt, cleaned=True):
    """
    convenience function
    if you have a tag's internals that were not generated via ``extract_tag_inner``, this will try to clean the internals before processing

    :param string txt: the inner text of a tag (the attributes)
    :param boolean cleaned: Default ``True``. If invoked with ``False``, will perform the following cleaning

        if not cleaned:
          txt = txt.strip()
          if txt.endswith('/'):
            txt = txt[:-1].rstrip()

    :returns dict: a dict of values via ``tag_inner_parser``.
                   an empty dict is returned if ``cleaned`` is ``False`` and ``txt`` is empty
    """
    if not cleaned:
        if not txt:
            return {}
        # strip BEFORE looking for the slash, otherwise whitespace after a
        # trailing slash (e.g. "a='1' / ") hides it and the slash would be
        # parsed as an attribute
        txt = txt.strip()
        if txt.endswith('/'):
            txt = txt[:-1].rstrip()
    return tag_inner_parser(txt)


def tag_parser(tag):
    """
    convenience function
    reduces a full tag to its attribute text with ``extract_tag_inner`` and hands that text to ``tag_inner_parser``

    :param string tag: a html tag
    :returns dict: a dict of values via ``tag_inner_parser``
    """
    return tag_inner_parser(extract_tag_inner(tag))
	def extract_tag_inner(tag):
		"""
		extracts the inner part of a tag - dropping brackets, tagname, trailing slash

		:arg string tag: a html tag of the formats:
			<TAG_NAME TAG_ATTRIBUTES>
			<TAG_NAME TAG_ATTRIBUTES/>
			<TAG_NAME TAG_ATTRIBUTES />

		:returns string: the attribute text of the tag.
			an empty string is returned when the tag has no attributes (e.g. "<img/>")

		:raises ValueError: if ``tag`` is empty, does not start with `<` or does not end with `>`
		"""
		# slices (not indexes) are compared, so an empty string raises
		# ValueError below instead of an IndexError
		if (tag[:1] != '<') or (tag[-1:] != '>'):
			raise ValueError("invalid tag")
		# remove the tag brackets
		tag = tag[1:-1].strip()
		# remove the self-closing slash. only the LAST character is dropped here;
		# the first character belongs to the tag name and must be kept so the
		# split below still finds the correct name/attributes boundary
		if tag.endswith('/'):
			tag = tag[:-1].strip()

		# remove the tag name, which is everything up to the first space.
		# `partition` yields '' as the remainder on a tag without attributes
		return tag.partition(' ')[2]


	def tag_inner_parser(txt):
		"""
		this parser expects/requires WELL FORMED html attributes:

		:arg string txt: Text to be parsed. This should be cleaned

		:returns dict: a dict of values for the attributes.
			if an attribute does not have a value (i.e. it is defined by its mere presence) the value will be `True`
			otherwise, values will always be a string (possibly an empty string, e.g. `alt=""`)

		:raises ValueError: if the character following a `=` is not a single or double quote

		The text should be cleaned beforehand to not contain trailing whitespace or slash

		Notes on quoting:
			* a quote character preceded by a backslash does not close the value
			* in single-quoted values, `\\'` is unescaped to `'`;
			  in double-quoted values the backslash is kept as-is
			* if the text ends before a value is closed, the key is recorded as `True`

		See ``extract_tag_inner`` for a function to extract the inner from a full tag.
		"""
		kw = {}  # kwargs dict
		n_k = False  # iN_Key
		n_v = False  # iN_Value
		k = None  # active Key being parsed
		v = None  # active Value being parsed; `None` until the opening quote is seen
		q = None  # quote character that opened the active value
		_qs = ('"', "'", )  # doing this is faring better than a direct comparison
		for c in txt:  # Char in text
			if n_k:
				# parsing a KEY
				if c == ' ':
					# SPACE means we're on an attribute that does not have a value.
					# encode this as TRUE
					kw[k] = True
					# RESET
					n_k = False
					# no need to reset other vars, the next loop will handle it
				elif c == '=':
					# `=` means we're ending the KEY and going to search a value
					# this is a SWITCH
					n_k = False
					n_v = True
					v = None
					q = None
				else:
					# assume this char is valid and keep building the KEY
					k += c
			elif n_v:
				# parsing a VALUE
				if v is None:
					# we just got here!
					# this parser requires VALUE attributes to be single or double quoted
					if c not in _qs:  # cPython is benching faster with this as a var
						raise ValueError("invalid quote")
					# note the quote character. it will be used to check for escaped quotes
					q = c
					v = ''
				elif c != q:
					# normal character, build out the value
					v += c
				elif v.endswith('\\'):
					# we hit an escaped quote character, treat it as normal.
					# `endswith` is used because the value may still be empty
					# here (e.g. `alt=""`), and indexing `v[-1]` would fail
					v += c
				else:
					# FINI! we hit the closing quote
					if q == "'":
						# unescape quoted chars
						v = v.replace("\\'", "'")
					# SET
					kw[k] = v
					# RESET
					k = None
					v = None
					n_k = False
					n_v = False
					q = None
			else:  # not n_k and not n_v:
				if c == ' ':
					continue
				# first character of a new KEY
				n_k = True
				k = c
		if k:
			# if we're here without clearing a value, then we ended on an attribute that does not have a value
			kw[k] = True
		return kw


	def tag_inner_parser_unknown(txt, cleaned=True):
		"""
		convenience function
		if you have a tag's internals that were not generated via ``extract_tag_inner``, this will try to clean the internals before processing

		:param string txt: the inner text of a tag (the attributes)
		:param boolean cleaned: Default ``True``. If invoked with ``False``, will perform the following cleaning

			if not cleaned:
			  txt = txt.strip()
			  if txt.endswith('/'):
			    txt = txt[:-1].rstrip()

		:returns dict: a dict of values via ``tag_inner_parser``.
			an empty dict is returned if ``cleaned`` is ``False`` and ``txt`` is empty
		"""
		if not cleaned:
			if not txt:
				return {}
			# strip BEFORE looking for the slash, otherwise whitespace after a
			# trailing slash (e.g. "a='1' / ") hides it and the slash would be
			# parsed as an attribute
			txt = txt.strip()
			if txt.endswith('/'):
				txt = txt[:-1].rstrip()
		return tag_inner_parser(txt)


	def tag_parser(tag):
		"""
		convenience function
		if you have a tag, this will extract the tag's attributes and then run them through the parser

		:param string tag: a html tag
		:returns dict: a dict of values via ``tag_inner_parser``
		"""
		# first reduce the full tag to its attribute text, then parse that text
		txt = extract_tag_inner(tag)
		return tag_inner_parser(txt)