Skip to content

Instantly share code, notes, and snippets.

@nathanntg
Last active August 29, 2015 14:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nathanntg/5abfbba2c3f43637274b to your computer and use it in GitHub Desktop.
Save nathanntg/5abfbba2c3f43637274b to your computer and use it in GitHub Desktop.
HTML Regular Expressions
import re
# match all script blocks; the end tag tolerates whitespace before ">" (e.g. "</script >"),
# which HTML allows and which get_tags() in this file already accepts
r = re.compile(r'<script[^>]*?>.*?</script\s*>', re.IGNORECASE | re.DOTALL)
# can be used to easily remove script tags
html_without_scripts = r.sub('', html)
# match all style blocks (same end-tag whitespace tolerance as above)
r = re.compile(r'<style[^>]*?>.*?</style\s*>', re.IGNORECASE | re.DOTALL)
# can be used to easily remove style tags
html_without_styles = r.sub('', html)
# remove all comments
def remove_comments(html):
    """
    Removes all comments (and any other "<!...>" construct, such as a doctype or an Internet Explorer
    conditional marker) from an HTML string. It is somewhat complex and not super elegant, but it handles
    many of the corner cases introduced by weird internet explorer conditional comments.

    A construct that opens with "<!--" ends at the next "-->". Anything else that opens with "<!" (or a
    "<!--" that is never closed) ends at its matching ">", counting nested "<"/">" pairs. If no end can be
    found, everything from that "<!" to the end of the string is dropped.

    html -- the HTML source as a string
    Returns the HTML string with those constructs removed.
    """
    comm_start = html.find('<!')
    if comm_start == -1:
        return html

    max_l = len(html)
    # collect the kept pieces and join once instead of growing a string with +=
    parts = [html[:comm_start]]
    while comm_start != -1:
        comm_end = -1

        # a real comment: pair "<!--" with "-->" so that a "->" inside the comment text does not end it
        # early; searching from the first dash lets the degenerate "<!-->" count as a complete comment
        if html.startswith('<!--', comm_start):
            close = html.find('-->', comm_start + 2)
            if close != -1:
                comm_end = close + 3

        # unclear ending, count opening and closing angle brackets
        if comm_end == -1:
            depth = 0
            for i in range(comm_start + 1, max_l):
                ch = html[i]
                if ch == '>':
                    if depth == 0:
                        comm_end = i + 1
                        break
                    depth -= 1
                elif ch == '<':
                    depth += 1
            if comm_end == -1:
                # never terminated: discard the remainder of the document
                return ''.join(parts)

        # keep the text between the end of this construct and the start of the next one (or the end)
        next_start = html.find('<!', comm_end)
        if next_start == -1:
            parts.append(html[comm_end:])
        else:
            parts.append(html[comm_end:next_start])
        comm_start = next_start
    return ''.join(parts)
# Example usage: strip HTML comments (and other "<!...>" constructs) from a document.
# NOTE(review): `html` is not defined in this snippet -- presumably supplied by the caller; confirm.
html_without_comments = remove_comments(html)
def remove_quotes(s):
    """
    Strips one pair of matching surrounding quotes (single or double) from a string.

    The string is returned unchanged when it is empty, a single character, or not wrapped in the same
    quote character on both ends, so an unbalanced value such as '"abc' no longer loses its last character.

    s -- the string to unquote
    Returns the string without its surrounding quotes, or s itself when there is no matching pair.
    """
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
def get_tags(html, tag, inner=False):
    """
    A light weight tool for finding all occurrences of the specified tag and returning attributes. If inner is true,
    then it also returns the contents of the tag as entry "=" in the dictionary. Returns a list of dictionaries for
    each tag found.

    html  -- the HTML source as a string
    tag   -- the tag name; it is inserted into the regular expression as-is and matched case-insensitively
    inner -- when true, each dictionary also has the key "=" holding the text between the opening tag and
             its closing tag (or the next opening tag of the same name); it is "" for a self-closing tag

    Attribute names are lower-cased and values have their surrounding quotes removed by remove_quotes().
    """
    # Attribute section: either nothing, or whitespace followed by quoted strings and single characters
    # other than quotes, "/" and ">". The unquoted alternative consumes exactly one character per
    # iteration; a starred class inside the starred group would backtrack exponentially on a tag that is
    # never closed.
    attr_section = r'''(|\s((?:"[^"]*"|'[^']*'|[^"'/>])*?))'''
    if inner:
        # Content runs to the closing tag, or stops before the next opening tag of the same name (for
        # unclosed tags such as "<p>one<p>two"). The lookahead requires the name to end there, so that
        # "<abbr" does not terminate the content of "<a>".
        tail = r'(?:/>|>(.*?)(?:</' + tag + r'\s*>|(?=<' + tag + r'[\s/>])))'
    else:
        tail = r'/?>'
    # DOTALL lets the inner content span several lines
    r_tag = re.compile('<' + tag + attr_section + tail, re.IGNORECASE | re.DOTALL)
    # attribute names may contain "-" and ":" (http-equiv, data-*, xml:lang)
    r_attr = re.compile(r'''\b([\w:-]+)\s*=\s*("[^"]*"|'[^']*'|[^'">\s]+)''')

    ret = []
    # each match is a tuple: t[1] is the raw attribute text and, when inner is true, t[2] is the content
    for t in r_tag.findall(html):
        entry = {}
        if inner:
            entry['='] = t[2]
        for name, value in r_attr.findall(t[1]):
            entry[name.lower()] = remove_quotes(value)
        ret.append(entry)
    return ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment