Skip to content

Instantly share code, notes, and snippets.

@fdb
Created June 9, 2009 16:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fdb/126598 to your computer and use it in GitHub Desktop.
Save fdb/126598 to your computer and use it in GitHub Desktop.
Escape all the tags from the HTML string, leaving only the allowed tags. Also, convert links in the text to <a> tags.
import re
import urllib
import string
import cgi
import hashlib
# Splits text on runs of whitespace. The capturing group keeps the whitespace
# runs in the result, so joining the pieces reproduces the input exactly.
word_split_re = re.compile(r'(\s+)')

# Tokens that may be wrapped around a URL without being part of it. The
# entity forms are listed so already-escaped angle brackets are recognised.
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']

# Breaks a single word into optional leading punctuation, the middle part
# (the URL candidate) and optional trailing punctuation.
punctuation_re = re.compile(
    '^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % (
        '|'.join(re.escape(token) for token in LEADING_PUNCTUATION),
        '|'.join(re.escape(token) for token in TRAILING_PUNCTUATION),
    )
)
def escape(html):
    """Return html with &, <, >, double quotes and single quotes replaced by entities."""
    # The ampersand must be handled first; otherwise the '&' that starts each
    # entity produced by the later substitutions would be escaped again.
    substitutions = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    escaped = html
    for raw, entity in substitutions:
        escaped = escaped.replace(raw, entity)
    return escaped
def urlize(text, trim_url_limit=None):
    """Convert http:// and https:// URLs in text into clickable links.

    The text is split on whitespace and each word is examined on its own.
    Only words whose middle part (after stripping leading punctuation such
    as an opening paren and trailing punctuation such as periods, commas
    and closing parens) starts with ``http://`` or ``https://`` become
    links. Bare ``www.`` hosts, e-mail addresses and other schemes are not
    linked.

    Escaping rules:

    * Words that contain none of ``.``, ``@`` or ``:`` are returned
      untouched, without escaping.
    * Words that contain one of those characters but are not a URL are
      passed through ``escape()``.
    * For a link, the URL, the link text and the surrounding punctuation
      are all passed through ``escape()``.

    If trim_url_limit is not None, link texts longer than this limit are
    truncated to trim_url_limit - 3 characters and an ellipsis ("...") is
    appended. The href always keeps the full URL.

    Args:
        text: The unicode text to process.
        trim_url_limit: Maximum length of the visible link text, or None
            for no limit.

    Returns:
        The unicode text with URLs wrapped in ``<a rel="nofollow">`` tags.

    Raises:
        AssertionError: If text is not a unicode string.

    Normal mode of operation:

    >>> print(urlize(u'http://www.google.com/'))
    <a href="http://www.google.com/" rel="nofollow">http://www.google.com/</a>

    Trim_url_limit can be used to shorten links in the text:

    >>> print(urlize(u'http://www.google.com/verylonglink', 25))
    <a href="http://www.google.com/verylonglink" rel="nofollow">http://www.google.com/...</a>

    Raw HTML is not supported. HTML should already be escaped:

    >>> print(urlize(u'<a href="http://www.google.com/">test</a>'))
    <a href=&quot;http://www.google.com/&quot;&gt;test&lt;/a&gt;

    Punctuation is not included in the link:

    >>> print(urlize(u'Google (http://www.google.com/)'))
    Google (<a href="http://www.google.com/" rel="nofollow">http://www.google.com/</a>)

    Javascript tricks are not converted. Note again, that you should escape
    HTML first:

    >>> print(urlize(u"javascript:window.alert('hello')"))
    javascript:window.alert(&#39;hello&#39;)
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    assert isinstance(text, type(u'')), "HTML needs to be unicode."

    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3.
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    def trim_url(link_text, limit=trim_url_limit):
        """Shorten link_text to at most `limit` characters, ellipsis included."""
        if limit is not None and len(link_text) > limit:
            return '%s...' % link_text[:max(0, limit - 3)]
        return link_text

    words = word_split_re.split(text)
    for i, word in enumerate(words):
        # Cheap pre-filter: a URL always contains at least one of these.
        if not ('.' in word or '@' in word or ':' in word):
            continue
        match = punctuation_re.match(word)
        if not match:
            continue
        lead, middle, trail = match.groups()
        if middle.startswith(('http://', 'https://')):
            # Quote the UTF-8 bytes: Python 2's quote() raises KeyError when
            # handed a unicode string containing non-ASCII characters.
            url = quote(middle.encode('utf-8'), safe='/&=:;#?+*')
            link = '<a href="%s" rel="nofollow">%s</a>' % (
                escape(url), escape(trim_url(middle)))
            words[i] = '%s%s%s' % (escape(lead), link, escape(trail))
        else:
            words[i] = escape(word)
    return u''.join(words)
def escape_html(html, allowed_tags=('strong', 'em', 'code')):
    """Escape all the tags from the HTML string, leaving only the allowed tags.

    This function tries to be as stupid as possible, only allowing tags it
    explicitly knows about, and escaping all the rest. Only the exact,
    attribute-less forms ``<tag>`` and ``</tag>`` of each allowed tag are
    preserved. Ampersands and angle brackets are escaped everywhere else;
    quotes are left alone.

    Args:
        html: The unicode string to escape.
        allowed_tags: Iterable of tag names that may pass through.

    Returns:
        The escaped unicode string.

    Raises:
        AssertionError: If html is not a unicode string.

    >>> print(escape_html(u'No tags here.'))
    No tags here.

    Default operation:

    >>> print(escape_html(u'<strong>strong</strong>, <em>em</em>, <code>code</code>'))
    <strong>strong</strong>, <em>em</em>, <code>code</code>

    Change the allowed tags:

    >>> print(escape_html(u'<b>bold</b>, <em>em</em>', allowed_tags=['b']))
    <b>bold</b>, &lt;em&gt;em&lt;/em&gt;

    Invalid HTML stays invalid, but no tags leak through:

    >>> print(escape_html(u'<em>em, <b>b'))
    <em>em, &lt;b&gt;b

    All data in the code block is also escaped:

    >>> print(escape_html(u'<code>if 1 > 2: print "unpossible"</code>'))
    <code>if 1 &gt; 2: print "unpossible"</code>
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    assert isinstance(html, type(u'')), "HTML needs to be unicode."

    def escape_markup(value):
        # '&' goes first so the entities produced for '<' and '>' are not
        # themselves re-escaped.
        return value.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    # Escape everything, then restore the allowed tags. Every '&' in the
    # escaped text starts an entity generated above, so the escaped form of a
    # tag can only appear where the literal tag stood in the input; text the
    # user typed as "&lt;em&gt;" has become "&amp;lt;em&amp;gt;" and never
    # matches. This needs no placeholder tokens that could collide with input.
    html = escape_markup(html)
    for tag in allowed_tags:
        for markup in ('<%s>' % tag, '</%s>' % tag):
            html = html.replace(escape_markup(markup), markup)
    return html
def process_html(html, allowed_tags=('strong', 'em', 'code'), convert_links=True, trim_url_limit=None):
    """Process the given HTML.

    Only the given tags are allowed to pass through, all the rest is escaped.
    If enabled, http:// and https:// links in the text are converted to
    <a href> tags.

    Args:
        html: The unicode string to process.
        allowed_tags: Iterable of tag names that may pass through unescaped.
        convert_links: Whether URLs should be turned into links.
        trim_url_limit: Passed on to urlize(); maximum length of the visible
            link text, or None for no limit.

    Returns:
        The processed unicode string.

    >>> print(process_html(u'You <strong>must</strong> see this: http://www.google.com/.'))
    You <strong>must</strong> see this: <a href="http://www.google.com/" rel="nofollow">http://www.google.com/</a>.
    >>> print(process_html(u'<a href="http://evil.com/">Good</a>'))
    &lt;a href="http://evil.com/"&gt;Good&lt;/a&gt;

    Allowed tags and entities next to punctuation are left intact:

    >>> print(process_html(u'<strong>Note:</strong> AT&T.'))
    <strong>Note:</strong> AT&amp;T.

    Ampersands in URLs are escaped exactly once:

    >>> print(process_html(u'http://x.com/?a=1&b=2'))
    <a href="http://x.com/?a=1&amp;b=2" rel="nofollow">http://x.com/?a=1&amp;b=2</a>
    """
    html = escape_html(html, allowed_tags)
    if not convert_links:
        return html

    # urlize() escapes every word that contains '.', ':' or '@'. Running it
    # over the already-escaped string would escape allowed tags and
    # double-escape entities, so only words that really hold a URL are
    # handed over.
    pieces = word_split_re.split(html)
    for i, piece in enumerate(pieces):
        match = punctuation_re.match(piece)
        if not match or not match.group('middle').startswith(('http://', 'https://')):
            continue
        # Undo the escaping done by escape_html ('&amp;' last, mirroring the
        # '&'-first order of escaping) so that urlize() escapes the link's
        # URL, text and surrounding punctuation exactly once.
        raw = piece.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
        pieces[i] = urlize(raw, trim_url_limit=trim_url_limit)
    return u''.join(pieces)
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in the docstrings
    # of this module.
    import doctest

    doctest.testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment