# nathanntg/html_regular_expressions.py

## html_regular_expressions.py
import re

# match all script blocks (case-insensitive; DOTALL lets "." span newlines inside the block)
r = re.compile('<script[^>]*?>.*?</script>', re.IGNORECASE | re.DOTALL)
# can be used to easily remove script tags
# NOTE(review): `html` is not defined in the visible part of this file; presumably the caller supplies it -- confirm
html_without_scripts = r.sub('', html)

# match all style blocks (same flags as the script pattern; note that `r` is rebound here)
r = re.compile('<style[^>]*?>.*?</style>', re.IGNORECASE | re.DOTALL)
# can be used to easily remove style tags
html_without_styles = r.sub('', html)

# remove all comments
def remove_comments(html):
    """
    Strip every "<!...>" construct (comments, conditional comments, declarations) from an HTML string.

    A construct opening with "<!-" runs up to and including the next "->". Anything else that opens with
    "<!", or a "<!-" with no "->" after it, is closed by the first ">" that is not paired with a "<" nested
    inside it. If no such closing ">" exists, everything from that "<!" to the end of the input is dropped.
    """
    opener = html.find('<!')
    if opener == -1:
        return html

    length = len(html)
    # text before the first "<!" is always kept
    kept = [html[:opener]]
    closer = 0
    while opener != -1:
        # keep the text between the previous construct and this one
        if closer > 0:
            kept.append(html[closer:opener])

        closer = -1
        if opener + 2 < length and html[opener + 2] == '-':
            # looks like "<!-": it runs up to and including the next "->"
            arrow = html.find('->', opener)
            if arrow != -1:
                closer = arrow + 2

        if closer == -1:
            # no "->" terminator: balance "<" against ">" to locate the closing bracket
            depth = 0
            for pos in range(opener + 1, length):
                char = html[pos]
                if char == '>':
                    if depth == 0:
                        closer = pos + 1
                        break
                    depth -= 1
                elif char == '<':
                    depth += 1
            else:
                # ran off the end without closing: discard the unterminated remainder
                return ''.join(kept)

        opener = html.find('<!', closer)

    if closer > 0:
        kept.append(html[closer:])

    return ''.join(kept)
# can be used to easily remove html comments (any "<!...>" construct, so e.g. a doctype is stripped too)
# NOTE(review): `html` is not defined in the visible part of this file; presumably the caller supplies it -- confirm
html_without_comments = remove_comments(html)


def remove_quotes(s):
    """
    Strip one pair of matching surrounding quotes (single or double) from a string.

    The string is returned unchanged when it is empty, shorter than two characters, or not wrapped in the
    same quote character at both ends, so a lone or unbalanced quote never eats real content.
    """
    # require the closing quote to match the opening one before slicing both ends off
    if len(s) >= 2 and s[0] in ('"', '\'') and s[0] == s[-1]:
        return s[1:-1]
    return s


def get_tags(html, tag, inner=False):
    """
    Find every occurrence of `tag` in `html` (case-insensitively) and collect its attributes.

    Returns a list with one dictionary per matched tag, mapping lower-cased attribute names to their values
    with surrounding quotes removed. When `inner` is true, each dictionary also carries the tag's contents
    under the key "="; the contents end at the closing tag or just before the next opening of the same tag.

    Note: `tag` is interpolated into the pattern unescaped, so regex metacharacters in it act as regex
    syntax. The contents are matched without re.DOTALL, so "." does not cross a newline.
    """
    if inner:
        # either self-closing, or capture contents up to the closing tag / next opening of the same tag
        tail = '(?:/>|>(.*?)(?:</' + tag + '\\s*>|(?=<' + tag + ')))'
    else:
        tail = '/?>'
    tag_pattern = re.compile('<' + tag + '(|\\s((?:"[^"]*"|\'[^\']*\'|[^"\'/>]*)*?))' + tail, re.IGNORECASE)
    attr_pattern = re.compile('\\b(\\w+)\\s*?=\\s*?("[^"]*?"|\'[^\']*?\'|[^\'">\\s]+)')

    found = []
    for groups in tag_pattern.findall(html):
        # groups[1] is the raw attribute text, groups[2] (inner mode only) the tag contents
        record = {'=': groups[2]} if inner else {}
        for name, value in attr_pattern.findall(groups[1]):
            record[name.lower()] = remove_quotes(value)
        found.append(record)
    return found
	import re

	# match all script blocks (case-insensitive; DOTALL lets "." span newlines inside the block)
	r = re.compile('<script[^>]*?>.*?</script>', re.IGNORECASE | re.DOTALL)
	# can be used to easily remove script tags
	# NOTE(review): `html` is not defined in the visible part of this file; presumably the caller supplies it -- confirm
	html_without_scripts = r.sub('', html)

	# match all style blocks (same flags as the script pattern; note that `r` is rebound here)
	r = re.compile('<style[^>]*?>.*?</style>', re.IGNORECASE | re.DOTALL)
	# can be used to easily remove style tags
	html_without_styles = r.sub('', html)

	# remove all comments
	def remove_comments(html):
	    """
	    Strip every "<!...>" construct (comments, conditional comments, declarations) from an HTML string.

	    A construct opening with "<!-" runs up to and including the next "->". Anything else that opens with
	    "<!", or a "<!-" with no "->" after it, is closed by the first ">" that is not paired with a "<" nested
	    inside it. If no such closing ">" exists, everything from that "<!" to the end of the input is dropped.
	    """
	    opener = html.find('<!')
	    if opener == -1:
	        return html

	    length = len(html)
	    # text before the first "<!" is always kept
	    kept = [html[:opener]]
	    closer = 0
	    while opener != -1:
	        # keep the text between the previous construct and this one
	        if closer > 0:
	            kept.append(html[closer:opener])

	        closer = -1
	        if opener + 2 < length and html[opener + 2] == '-':
	            # looks like "<!-": it runs up to and including the next "->"
	            arrow = html.find('->', opener)
	            if arrow != -1:
	                closer = arrow + 2

	        if closer == -1:
	            # no "->" terminator: balance "<" against ">" to locate the closing bracket
	            depth = 0
	            for pos in range(opener + 1, length):
	                char = html[pos]
	                if char == '>':
	                    if depth == 0:
	                        closer = pos + 1
	                        break
	                    depth -= 1
	                elif char == '<':
	                    depth += 1
	            else:
	                # ran off the end without closing: discard the unterminated remainder
	                return ''.join(kept)

	        opener = html.find('<!', closer)

	    if closer > 0:
	        kept.append(html[closer:])

	    return ''.join(kept)
	# can be used to easily remove html comments (any "<!...>" construct, so e.g. a doctype is stripped too)
	# NOTE(review): `html` is not defined in the visible part of this file; presumably the caller supplies it -- confirm
	html_without_comments = remove_comments(html)


	def remove_quotes(s):
	    """
	    Strip one pair of matching surrounding quotes (single or double) from a string.

	    The string is returned unchanged when it is empty, shorter than two characters, or not wrapped in the
	    same quote character at both ends, so a lone or unbalanced quote never eats real content.
	    """
	    # require the closing quote to match the opening one before slicing both ends off
	    if len(s) >= 2 and s[0] in ('"', '\'') and s[0] == s[-1]:
	        return s[1:-1]
	    return s


	def get_tags(html, tag, inner=False):
	    """
	    Find every occurrence of `tag` in `html` (case-insensitively) and collect its attributes.

	    Returns a list with one dictionary per matched tag, mapping lower-cased attribute names to their values
	    with surrounding quotes removed. When `inner` is true, each dictionary also carries the tag's contents
	    under the key "="; the contents end at the closing tag or just before the next opening of the same tag.

	    Note: `tag` is interpolated into the pattern unescaped, so regex metacharacters in it act as regex
	    syntax. The contents are matched without re.DOTALL, so "." does not cross a newline.
	    """
	    if inner:
	        # either self-closing, or capture contents up to the closing tag / next opening of the same tag
	        tail = '(?:/>|>(.*?)(?:</' + tag + '\\s*>|(?=<' + tag + ')))'
	    else:
	        tail = '/?>'
	    tag_pattern = re.compile('<' + tag + '(|\\s((?:"[^"]*"|\'[^\']*\'|[^"\'/>]*)*?))' + tail, re.IGNORECASE)
	    attr_pattern = re.compile('\\b(\\w+)\\s*?=\\s*?("[^"]*?"|\'[^\']*?\'|[^\'">\\s]+)')

	    found = []
	    for groups in tag_pattern.findall(html):
	        # groups[1] is the raw attribute text, groups[2] (inner mode only) the tag contents
	        record = {'=': groups[2]} if inner else {}
	        for name, value in attr_pattern.findall(groups[1]):
	            record[name.lower()] = remove_quotes(value)
	        found.append(record)
	    return found