fdb/escape_html.py

## escape_html.py
import re
import urllib
import string
import cgi
import hashlib

# Splits text into alternating runs of non-whitespace and whitespace. The
# capturing group keeps the whitespace runs in the result, so joining the
# pieces again reproduces the input exactly.
word_split_re = re.compile(r'(\s+)')

# Punctuation that may surround a URL without being part of it. The entity
# forms ('&lt;' / '&gt;') cover text that has already been HTML-escaped.
LEADING_PUNCTUATION  = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']

# Breaks a single word into three named groups: any run of leading
# punctuation, the shortest possible middle part, and any run of trailing
# punctuation.
punctuation_re = re.compile(
    '^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % (
        '|'.join(map(re.escape, LEADING_PUNCTUATION)),
        '|'.join(map(re.escape, TRAILING_PUNCTUATION)),
    )
)

def escape(html):
    """Return html with '&', '<', '>', double and single quotes replaced by HTML entities."""
    # The ampersand has to be handled first; otherwise the ampersands that
    # the later replacements introduce would be escaped a second time.
    for char, entity in (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    ):
        html = html.replace(char, entity)
    return html

def urlize(text, trim_url_limit=None):
    """
    Converts any http:// or https:// URLs in text into clickable links.

    The text is split on whitespace and every word is looked at on its own.
    Only words containing '.', '@' or ':' are processed:

    * if the word, after stripping the leading and trailing punctuation
      matched by punctuation_re (periods, commas, parens, angle brackets),
      starts with http:// or https://, it becomes an
      <a href="..." rel="nofollow"> link and the punctuation stays outside
      the link;
    * any other such word is passed through escape().

    Words without any of those three characters, and the whitespace between
    words, are returned untouched -- in particular they are NOT escaped.

    If trim_url_limit is not None, link text longer than this limit is
    truncated to trim_url_limit-3 characters and followed by an ellipsis.
    The href always carries the complete URL.

    text must be a unicode string, otherwise an AssertionError is raised.
    The result is a unicode string.

    Normal mode of operation:
    >>> urlize(u'http://www.google.com/')
    u'<a href="http://www.google.com/" rel="nofollow">http://www.google.com/</a>'

    Trim_url_limit can be used to shorten links in the text:
    >>> urlize(u'http://www.google.com/verylonglink', 25)
    u'<a href="http://www.google.com/verylonglink" rel="nofollow">http://www.google.com/...</a>'

    This is not supported. HTML should already be escaped:
    >>> urlize(u'<a href="http://www.google.com/">test</a>')
    u'<a href=&quot;http://www.google.com/&quot;&gt;test&lt;/a&gt;'

    Punctuation is not included in the link:
    >>> urlize(u'Google (http://www.google.com/)')
    u'Google (<a href="http://www.google.com/" rel="nofollow">http://www.google.com/</a>)'

    Javascript tricks are not converted. Note again, that you should escape HTML first:
    >>> urlize(u'javascript:window.alert(\\\'hello\\\')')
    u'javascript:window.alert(&#39;hello&#39;)'
    """

    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    assert isinstance(text, type(u'')), "HTML needs to be unicode."

    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3.
    try:
        from urllib import quote
    except ImportError:
        from urllib.parse import quote

    def trim_url(link_text):
        # Shortens only the visible link text; the href is built separately.
        if trim_url_limit is not None and len(link_text) > trim_url_limit:
            return '%s...' % link_text[:max(0, trim_url_limit - 3)]
        return link_text

    words = word_split_re.split(text)
    for i, word in enumerate(words):
        # Cheap pre-filter: words without any of these characters are left
        # exactly as they are.
        if not ('.' in word or '@' in word or ':' in word):
            continue
        match = punctuation_re.match(word)
        if not match:
            continue
        lead, middle, trail = match.groups()
        if middle.startswith(('http://', 'https://')):
            # Quote the UTF-8 bytes rather than the unicode string: Python 2's
            # urllib.quote() raises KeyError for non-ASCII unicode input.
            # NOTE: '%' is not in the safe set, so percent-escapes that are
            # already present in the URL get encoded a second time.
            url = quote(middle.encode('utf-8'), safe='/&=:;#?+*')
            link = '<a href="%s" rel="nofollow">%s</a>' % (escape(url), escape(trim_url(middle)))
            words[i] = '%s%s%s' % (escape(lead), link, escape(trail))
        else:
            words[i] = escape(word)
    return u''.join(words)

def escape_html(html, allowed_tags=['strong', 'em', 'code']):
    """Escape all the tags from the HTML string, leaving only the allowed tags.

    This method tries to be as stupid as possible, only allowing tags it explicitly
    knows about, and escaping all the rest. For every name in allowed_tags only
    the exact forms <name> and </name> are kept; everything else (other tags,
    allowed tags carrying attributes, different letter case, plain text) has
    its '&', '<' and '>' replaced by entities. Quotes are left alone.

    html must be a unicode string, otherwise an AssertionError is raised.
    allowed_tags is only read, never modified. Returns a unicode string.

    >>> escape_html(u'No tags here.')
    u'No tags here.'

    Default operation:
    >>> escape_html(u'<strong>strong</strong>, <em>em</em>, <code>code</code>')
    u'<strong>strong</strong>, <em>em</em>, <code>code</code>'

    Change the allowed tags:
    >>> escape_html(u'<b>bold</b>, <em>em</em>', allowed_tags=['b'])
    u'<b>bold</b>, &lt;em&gt;em&lt;/em&gt;'

    Invalid HTML stays invalid, but no tags leak through:
    >>> escape_html(u'<em>em, <b>b')
    u'<em>em, &lt;b&gt;b'

    All data in the code block is also escaped:
    >>> escape_html(u'<code>if 1 > 2: print "unpossible"</code>')
    u'<code>if 1 &gt; 2: print "unpossible"</code>'
    """

    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
    assert isinstance(html, type(u'')), "HTML needs to be unicode."

    def escape_text(fragment):
        # '&' first, so the entities produced for '<' and '>' are not
        # escaped again. Quotes are deliberately not touched.
        return fragment.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    # Each allowed tag name contributes its literal opening and closing form.
    literal_tags = []
    for tag in allowed_tags:
        literal_tags.append('<%s>' % tag)
        literal_tags.append('</%s>' % tag)
    if not literal_tags:
        return escape_text(html)

    # Splitting on the tags (instead of swapping them for placeholder
    # digests) means no input text can ever be mistaken for a placeholder.
    # The capturing group makes re.split() keep the matched tags: even
    # indexes hold the text between tags, odd indexes hold the tags.
    tag_re = re.compile('(%s)' % '|'.join(re.escape(t) for t in literal_tags))
    pieces = tag_re.split(html)
    for index in range(0, len(pieces), 2):
        pieces[index] = escape_text(pieces[index])
    return u''.join(pieces)

def process_html(html, allowed_tags=['strong', 'em', 'code'], convert_links=True, trim_url_limit=None):
    """Escapes the given HTML and optionally turns URLs into links.

    The text is first passed through escape_html(), which lets only the tags
    named in allowed_tags through. Unless convert_links is false, the result
    is then handed to urlize() together with trim_url_limit.

    urlize() escapes some words itself, so markup that escape_html() has
    already escaped can come out escaped twice, as the second example shows.

    >>> process_html(u'You <strong>must</strong> see this: http://www.google.com/.')
    u'You <strong>must</strong> see this: <a href="http://www.google.com/" rel="nofollow">http://www.google.com/</a>.'


    >>> process_html(u'<a href="http://evil.com/">Good</a>')
    u'&lt;a href=&quot;http://evil.com/&quot;&amp;gt;Good&amp;lt;/a&amp;gt;'
    """

    escaped = escape_html(html, allowed_tags)
    if not convert_links:
        return escaped
    return urlize(escaped, trim_url_limit=trim_url_limit)

if __name__ == '__main__':
    # Run as a script: execute the doctests embedded in this module.
    import doctest
    doctest.testmod()
	import re
	import urllib
	import string
	import cgi
	import hashlib

	# Splits text into alternating runs of non-whitespace and whitespace. The
	# capturing group keeps the whitespace runs in the result, so joining the
	# pieces again reproduces the input exactly.
	word_split_re = re.compile(r'(\s+)')

	# Punctuation that may surround a URL without being part of it. The entity
	# forms ('&lt;' / '&gt;') cover text that has already been HTML-escaped.
	LEADING_PUNCTUATION = ['(', '<', '&lt;']
	TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']

	# Breaks a single word into three named groups: any run of leading
	# punctuation, the shortest possible middle part, and any run of trailing
	# punctuation. The alternatives are joined with a bare '|' so that they
	# act as regex alternation.
	punctuation_re = re.compile(
	    '^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % (
	        '|'.join(map(re.escape, LEADING_PUNCTUATION)),
	        '|'.join(map(re.escape, TRAILING_PUNCTUATION)),
	    )
	)

	def escape(html):
	    """Return html with '&', '<', '>', double and single quotes replaced by HTML entities."""
	    # The ampersand has to be handled first; otherwise the ampersands that
	    # the later replacements introduce would be escaped a second time.
	    for char, entity in (
	        ('&', '&amp;'),
	        ('<', '&lt;'),
	        ('>', '&gt;'),
	        ('"', '&quot;'),
	        ("'", '&#39;'),
	    ):
	        html = html.replace(char, entity)
	    return html

	def urlize(text, trim_url_limit=None):
	    """
	    Converts any http:// or https:// URLs in text into clickable links.

	    The text is split on whitespace and every word is looked at on its own.
	    Only words containing '.', '@' or ':' are processed:

	    * if the word, after stripping the leading and trailing punctuation
	      matched by punctuation_re (periods, commas, parens, angle brackets),
	      starts with http:// or https://, it becomes an
	      <a href="..." rel="nofollow"> link and the punctuation stays outside
	      the link;
	    * any other such word is passed through escape().

	    Words without any of those three characters, and the whitespace between
	    words, are returned untouched -- in particular they are NOT escaped.

	    If trim_url_limit is not None, link text longer than this limit is
	    truncated to trim_url_limit-3 characters and followed by an ellipsis.
	    The href always carries the complete URL.

	    text must be a unicode string, otherwise an AssertionError is raised.
	    The result is a unicode string.

	    Normal mode of operation:
	    >>> urlize(u'http://www.google.com/')
	    u'<a href="http://www.google.com/" rel="nofollow">http://www.google.com/</a>'

	    Trim_url_limit can be used to shorten links in the text:
	    >>> urlize(u'http://www.google.com/verylonglink', 25)
	    u'<a href="http://www.google.com/verylonglink" rel="nofollow">http://www.google.com/...</a>'

	    This is not supported. HTML should already be escaped:
	    >>> urlize(u'<a href="http://www.google.com/">test</a>')
	    u'<a href=&quot;http://www.google.com/&quot;&gt;test&lt;/a&gt;'

	    Punctuation is not included in the link:
	    >>> urlize(u'Google (http://www.google.com/)')
	    u'Google (<a href="http://www.google.com/" rel="nofollow">http://www.google.com/</a>)'

	    Javascript tricks are not converted. Note again, that you should escape HTML first:
	    >>> urlize(u'javascript:window.alert(\\\'hello\\\')')
	    u'javascript:window.alert(&#39;hello&#39;)'
	    """

	    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
	    assert isinstance(text, type(u'')), "HTML needs to be unicode."

	    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3.
	    try:
	        from urllib import quote
	    except ImportError:
	        from urllib.parse import quote

	    def trim_url(link_text):
	        # Shortens only the visible link text; the href is built separately.
	        if trim_url_limit is not None and len(link_text) > trim_url_limit:
	            return '%s...' % link_text[:max(0, trim_url_limit - 3)]
	        return link_text

	    words = word_split_re.split(text)
	    for i, word in enumerate(words):
	        # Cheap pre-filter: words without any of these characters are left
	        # exactly as they are.
	        if not ('.' in word or '@' in word or ':' in word):
	            continue
	        match = punctuation_re.match(word)
	        if not match:
	            continue
	        lead, middle, trail = match.groups()
	        if middle.startswith(('http://', 'https://')):
	            # Quote the UTF-8 bytes rather than the unicode string: Python 2's
	            # urllib.quote() raises KeyError for non-ASCII unicode input.
	            # NOTE: '%' is not in the safe set, so percent-escapes that are
	            # already present in the URL get encoded a second time.
	            url = quote(middle.encode('utf-8'), safe='/&=:;#?+*')
	            link = '<a href="%s" rel="nofollow">%s</a>' % (escape(url), escape(trim_url(middle)))
	            words[i] = '%s%s%s' % (escape(lead), link, escape(trail))
	        else:
	            words[i] = escape(word)
	    return u''.join(words)

	def escape_html(html, allowed_tags=['strong', 'em', 'code']):
	    """Escape all the tags from the HTML string, leaving only the allowed tags.

	    This method tries to be as stupid as possible, only allowing tags it explicitly
	    knows about, and escaping all the rest. For every name in allowed_tags only
	    the exact forms <name> and </name> are kept; everything else (other tags,
	    allowed tags carrying attributes, different letter case, plain text) has
	    its '&', '<' and '>' replaced by entities. Quotes are left alone.

	    html must be a unicode string, otherwise an AssertionError is raised.
	    allowed_tags is only read, never modified. Returns a unicode string.

	    >>> escape_html(u'No tags here.')
	    u'No tags here.'

	    Default operation:
	    >>> escape_html(u'<strong>strong</strong>, <em>em</em>, <code>code</code>')
	    u'<strong>strong</strong>, <em>em</em>, <code>code</code>'

	    Change the allowed tags:
	    >>> escape_html(u'<b>bold</b>, <em>em</em>', allowed_tags=['b'])
	    u'<b>bold</b>, &lt;em&gt;em&lt;/em&gt;'

	    Invalid HTML stays invalid, but no tags leak through:
	    >>> escape_html(u'<em>em, <b>b')
	    u'<em>em, &lt;b&gt;b'

	    All data in the code block is also escaped:
	    >>> escape_html(u'<code>if 1 > 2: print "unpossible"</code>')
	    u'<code>if 1 &gt; 2: print "unpossible"</code>'
	    """

	    # type(u'') is `unicode` on Python 2 and `str` on Python 3.
	    assert isinstance(html, type(u'')), "HTML needs to be unicode."

	    def escape_text(fragment):
	        # '&' first, so the entities produced for '<' and '>' are not
	        # escaped again. Quotes are deliberately not touched.
	        return fragment.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

	    # Each allowed tag name contributes its literal opening and closing form.
	    literal_tags = []
	    for tag in allowed_tags:
	        literal_tags.append('<%s>' % tag)
	        literal_tags.append('</%s>' % tag)
	    if not literal_tags:
	        return escape_text(html)

	    # Splitting on the tags (instead of swapping them for placeholder
	    # digests) means no input text can ever be mistaken for a placeholder.
	    # The capturing group makes re.split() keep the matched tags: even
	    # indexes hold the text between tags, odd indexes hold the tags.
	    tag_re = re.compile('(%s)' % '|'.join(re.escape(t) for t in literal_tags))
	    pieces = tag_re.split(html)
	    for index in range(0, len(pieces), 2):
	        pieces[index] = escape_text(pieces[index])
	    return u''.join(pieces)

	def process_html(html, allowed_tags=['strong', 'em', 'code'], convert_links=True, trim_url_limit=None):
	    """Escapes the given HTML and optionally turns URLs into links.

	    The text is first passed through escape_html(), which lets only the tags
	    named in allowed_tags through. Unless convert_links is false, the result
	    is then handed to urlize() together with trim_url_limit.

	    urlize() escapes some words itself, so markup that escape_html() has
	    already escaped can come out escaped twice, as the second example shows.

	    >>> process_html(u'You <strong>must</strong> see this: http://www.google.com/.')
	    u'You <strong>must</strong> see this: <a href="http://www.google.com/" rel="nofollow">http://www.google.com/</a>.'


	    >>> process_html(u'<a href="http://evil.com/">Good</a>')
	    u'&lt;a href=&quot;http://evil.com/&quot;&amp;gt;Good&amp;lt;/a&amp;gt;'
	    """

	    escaped = escape_html(html, allowed_tags)
	    if not convert_links:
	        return escaped
	    return urlize(escaped, trim_url_limit=trim_url_limit)

	if __name__ == '__main__':
	    # Run as a script: execute the doctests embedded in this module.
	    import doctest
	    doctest.testmod()