Skip to content

Instantly share code, notes, and snippets.

@diosmosis
Created July 8, 2011 15:10
Show Gist options
  • Save diosmosis/1072047 to your computer and use it in GitHub Desktop.
Save diosmosis/1072047 to your computer and use it in GitHub Desktop.
HTML Sanitizer written in Ruby
require 'set'
# A SanitizationLevel holds information regarding an XML node's start tag.
# Levels are used to incrementally build sanitized HTML while dealing with
# incomplete nodes (ie, nodes that are never closed).
class SanitizationLevel
# Constructor.
def initialize(tag_name, sanitized, start_pos, end_pos, tagsize)
@tag_name = tag_name
@sanitized = sanitized
@start_pos = start_pos
@end_pos = end_pos
@tagsize = tagsize
@closed = false
end
# Gets the tag name of the node this level is associated with.
# This property cannot be set.
def tag_name
@tag_name
end
# Gets the current sanitized string for all levels below this level. This
# property will only be modified after a node's children have been processed.
# This property cannot be set.
def sanitized
@sanitized
end
# Gets the starting index of this level's node in the *pre-escaped* HTML.
def start_pos
@start_pos
end
# Gets the current end position of this level's node in the *pre-escaped*
# HTML.
def end_pos
@end_pos
end
# Sets the current end position of this level's node in the *pre-escaped*
# HTML.
def end_pos=(value)
@end_pos = value
end
# Gets the size of the start tag of this level's node.
# This property cannot be set.
def tagsize
@tagsize
end
# A helper function that will extract the 'extra' information of this level's
# node start given the pre-escaped HTML. The 'extra' information is
# essentially the attributes of an XML node.
def extra(input)
input[@start_pos + 4 + @tag_name.size..@start_pos + @tagsize - 5]
end
end
# Regex that matches XML node attributes. The following groups are created by
# a match:
# 0. The attribute key.
# 1. The attribute value (or nil if there is none).
ATTRIBUTE_REGEX = /([a-zA-Z0-9_]+)(?:\s*=\s*((?:[a-zA-Z0-9_]+)|(?:"[^"]+")|(?:'[^']+'))?)?/
# Regex that matches XML node starts & node ends in escaped HTML. The following
# groups are created by a match:
# 0. the entire match string
# 1. the '/' in a node end, or nil f not present. used to tell if this is the
# start or end of a node.
# 2. the tag id, ie, 'html', 'body', 'input', etc.
# 3. everything after the tag id and before the ending >
# NOTE: Regexes are fragile. As such, this regex can parse node starts that use
# '>' in attribute values, but cannot deal with tags that spanmultiple
# lines.
NODE_TAG_REGEX = /(<(\/?)([a-zA-Z0-9_]+)((?:.(?!<)|(?:"[^"]*")|(?:'[^']*'))*)>)/
# Escapes '&', '"' and '\'' characters in a string and returns the escaped
# string. This is executed on chunks of text that are known not to be
# XML tags.
def escape_nontags(s)
return s.gsub(/(?:&(?![gl]t;))|["']/, {'&' => '&', '"' => '"', "'" => '''})
end
# Replaces escaped HTML ('<', '>', '&', '"', ''') with
# the unescaped equivalents. Used when a complete <pre> node is found.
# The supplied string is modified in-place.
def unescape(s)
s.gsub!(/&(?:(?:[lg]t)|(?:amp)|(?:quot)|(?:#39));/,
{'&lt;' => '<', '&gt;' => '>', '&amp;' => '&', '&quot;' => '"', '&#39;' => "'"})
end
# Sanitizes the attributes section of an XML node start by including attributes
# that are in the supplied whitelist set.
# The sanitized version is returned.
def sanitize_attributes(attributes, whitelist)
sanitized = ''
attributes.scan ATTRIBUTE_REGEX do |key, value|
if whitelist.include? key.downcase
sanitized << ' ' << key
sanitized << '=' << value if value
end
end
return sanitized
end
# The stack of sanitization levels. There will always be one level representing
# the document root.
# This class is really just a wrapper around an array.
class SanitizationStack
# Constructor. Creates the root santization level.
def initialize()
@levels = [SanitizationLevel.new('', '', -1, 0, 0)]
@last_closed_start = 0
end
# Adds a sanitization level using the found tag's name, start position (in
# the *pre-escaped* HTML), and the size of the tag found.
# The sanitized string is initialized to an empty string and the level's
# end position is initialized to the end of the start tag.
def push(tag_name, start_pos, tagsize)
@levels << SanitizationLevel.new(tag_name, '', start_pos, start_pos + tagsize, tagsize)
end
# Returns the top level in the stack.
def top
@levels.last
end
# Searches the stack from top to bottom looking for a SanitizationLevel with
# the given tag name. The index of the found level is returned. Nil is
# returned if nothing is found.
def most_recent_tag_start(tag_name)
@levels.rindex {|x| x.tag_name == tag_name}
end
# Flattens and sanitizes a node using its associated SanitizationLevel.
# Every level after 'from' that has not been sanitized is assumed to
# be an unmatched node start and left escaped.
def close_node(input, from, current, attribute_whitelist)
result = ''
parent_level = @levels[from - 1]
this_level = @levels[from]
# if there's any text before 'this' node and the current known end of the
# parent node, escape and append it
if this_level.start_pos != 0
in_between = input[parent_level.end_pos..this_level.start_pos-1]
result << escape_nontags(in_between)
end
# sanitize the attributes of 'this' node.
attributes = sanitize_attributes(this_level.extra(input), attribute_whitelist)
# output 'this' node's starting tag (unescaped)
result << '<' << this_level.tag_name << attributes << '>'
result << this_level.sanitized
# flatten every child of 'this' node. we treat those nodes as unmatched
# starts, leaving them escaped.
(from + 1..@levels.size-1).each do |i|
level = @levels[i]
level_parent = @levels[i-1]
# escape and append any text between level's start and its parent's end
result << escape_nontags(input[level_parent.end_pos..level.start_pos-1])
# escape and append the incomplete level's tag
result << escape_nontags(input[level.start_pos..level.start_pos + level.tagsize - 1])
result << level.sanitized
end
# append the text between the last parsed tag's end position and the
# current position, and this level's closing tag
result << escape_nontags(input[@levels.last.end_pos..current-1]) \
<< '</' << this_level.tag_name << '>'
# remove every level including and after 'from'
@levels[from..@levels.size-1] = []
# save the new top's end_pos. this must be remembered for 'finish', but is
# overwritten by 'sanitize'
@last_closed_start = parent_level.end_pos
return result
end
# Finishes up the sanitization process. Deals with any remaining unmatched
# nodes and any text after the last node start found.
# The sanitized text is returned.
def finish(input)
result = ''
# if there is more than one level in the stack, there are levels that have
# not been matched. the text they cover must be escaped and appended.
if @levels.size > 1
result << escape_nontags(input[0..@last_closed_start - 1])
end
# append the root level's sanitized text, then escape and append escape
# everything after it
top = @levels.last
result << top.sanitized << escape_nontags(input[top.end_pos..input.size-1])
return result
end
end
# Sanitizes HTML input using a Hash that maps allowed node names with sets of
# allowed attribute names.
#
# This is a whitelist sanitization function, so the HTML input is escaped
# before it is processed. This way, any dangerous node types you have not
# considered will be dealt with.
#
# In addition to sanitization, this function will do some mild formatting
# and is able to recognize and santize malformed HTML.
#
# Implementation Notes:
# * The sanitized string is built incrementally, thus allowing the use of
# a stack instead of a tree structure. This strategy is faster than
# creating an entire tree and more powerful than a simple regex search.
# * The regex used to parse XML node tags is not able to parse multi-line
# tags. *Tags must therefore exist on one line.*
def sanitize_html(input, whitelist)
levelstack = SanitizationStack.new
# escape ALL < and > characters. xml nodes with tag types that exist in the
# whitelist will be unescaped below.
input.gsub!(/[<>]/, {'<' => '&lt;', '>' => '&gt;'})
# process everything that looks like an XML node tag
input.scan NODE_TAG_REGEX do |tag, slash, tag_name, extra|
next unless whitelist.has_key? tag_name.downcase
current = Regexp.last_match.offset(0).first
has_last_slash = (extra and extra.length > 0 and extra[-1] == '/')
is_start_node = (slash.empty? and not has_last_slash)
if is_start_node
# tag is a node start (ie, <abc>)
levelstack.push(tag_name, current, tag.size)
elsif has_last_slash
# tag is a whole node (ie, <abc/>)
top = levelstack.top
# append everything between the end of the last found tag and this tag to
# the parent's sanitized string
top.sanitized << input[top.end_pos..current-1] if current != 0
# append the tag with sanitized attributes to the parent's sanitized
# string
top.sanitized << '<' << tag_name << sanitize_attributes(extra, whitelist[tag_name]) << '/>'
# update the ending position of the parent of the node
top.end_pos = current + tag.size
elsif not extra or extra.empty?
# tag is a node end (ie, </abc>)
# find the index of the sanitization level that this closing tag
# references. if none can be found, this is an *unmatched node end*,
# so we ignore it.
start_i = levelstack.most_recent_tag_start(tag_name)
if start_i
# flatten and remove the sub-tree that starts from start_i
flattened = levelstack.close_node(input, start_i, current, whitelist[tag_name])
# if we're closing a <pre> node, unescape everything the node contains.
if tag_name == 'pre'
unescape(flattened)
end
# append the flattened string to the parent's sanitized string and
# update its end position.
top = levelstack.top
top.sanitized << flattened
top.end_pos = current + tag.size
end
end
end
# finish up: add everything after last found tag and make sure to deal with
# any lingering unmatched node starts
return levelstack.finish(input)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment