diosmosis/sanitize_html.rb

## sanitize_html.rb
require 'set'

# A SanitizationLevel holds information regarding an XML node's start tag.
# Levels are used to incrementally build sanitized HTML while dealing with
# incomplete nodes (ie, nodes that are never closed).
class SanitizationLevel

  # Constructor.
  def initialize(tag_name, sanitized, start_pos, end_pos, tagsize)
    @tag_name = tag_name
    @sanitized = sanitized
    @start_pos = start_pos
    @end_pos = end_pos
    @tagsize = tagsize
    @closed = false
  end

  # Gets the tag name of the node this level is associated with.
  # This property cannot be set.
  def tag_name
    @tag_name
  end

  # Gets the current sanitized string for all levels below this level. This
  # property will only be modified after a node's children have been processed.
  # This property cannot be set.
  def sanitized
    @sanitized
  end

  # Gets the starting index of this level's node in the *pre-escaped* HTML.
  def start_pos
    @start_pos
  end

  # Gets the current end position of this level's node in the *pre-escaped*
  # HTML.
  def end_pos
    @end_pos
  end

  # Sets the current end position of this level's node in the *pre-escaped*
  # HTML.
  def end_pos=(value)
    @end_pos = value
  end

  # Gets the size of the start tag of this level's node.
  # This property cannot be set.
  def tagsize
    @tagsize
  end

  # A helper function that will extract the 'extra' information of this level's
  # node start given the pre-escaped HTML. The 'extra' information is
  # essentially the attributes of an XML node.
  def extra(input)
    input[@start_pos + 4 + @tag_name.size..@start_pos + @tagsize - 5]
  end
end

# Regex that matches XML node attributes. The following groups are created by
# a match:
# 0. The attribute key.
# 1. The attribute value (or nil if there is none).
ATTRIBUTE_REGEX = /([a-zA-Z0-9_]+)(?:\s*=\s*((?:[a-zA-Z0-9_]+)|(?:"[^"]+")|(?:'[^']+'))?)?/

# Regex that matches XML node starts & node ends in escaped HTML. The following
# groups are created by a match:
# 0. the entire match string
# 1. the '/' in a node end, or nil f not present. used to tell if this is the
#    start or end of a node.
# 2. the tag id, ie, 'html', 'body', 'input', etc.
# 3. everything after the tag id and before the ending &gt;
# NOTE: Regexes are fragile. As such, this regex can parse node starts that use
#       '>' in attribute values, but cannot deal with tags that spanmultiple
#       lines.
NODE_TAG_REGEX = /(&lt;(\/?)([a-zA-Z0-9_]+)((?:.(?!&lt;)|(?:"[^"]*")|(?:'[^']*'))*)&gt;)/

# Escapes '&', '"' and '\'' characters in a string and returns the escaped
# string. This is executed on chunks of text that are known not to be
# XML tags.
def escape_nontags(s)
  return s.gsub(/(?:&(?![gl]t;))|["']/, {'&' => '&amp;', '"' => '&quot;', "'" => '&#39;'})
end

# Replaces escaped HTML ('&lt;', '&gt;', '&amp;', '&quot;', '&#39;') with
# the unescaped equivalents. Used when a complete <pre> node is found.
# The supplied string is modified in-place.
def unescape(s)
  s.gsub!(/&(?:(?:[lg]t)|(?:amp)|(?:quot)|(?:#39));/,
    {'&lt;' => '<', '&gt;' => '>', '&amp;' => '&', '&quot;' => '"', '&#39;' => "'"})
end

# Sanitizes the attributes section of an XML node start by including attributes
# that are in the supplied whitelist set.
# The sanitized version is returned.
def sanitize_attributes(attributes, whitelist)
  sanitized = ''

  attributes.scan ATTRIBUTE_REGEX do |key, value|
    if whitelist.include? key.downcase
      sanitized << ' ' << key

      sanitized << '=' << value if value
    end
  end

  return sanitized
end

# The stack of sanitization levels. There will always be one level representing
# the document root.
# This class is really just a wrapper around an array.
class SanitizationStack

  # Constructor. Creates the root santization level.
  def initialize()
    @levels = [SanitizationLevel.new('', '', -1, 0, 0)]
    @last_closed_start = 0
  end

  # Adds a sanitization level using the found tag's name, start position (in
  # the *pre-escaped* HTML), and the size of the tag found.
  # The sanitized string is initialized to an empty string and the level's
  # end position is initialized to the end of the start tag.
  def push(tag_name, start_pos, tagsize)
    @levels << SanitizationLevel.new(tag_name, '', start_pos, start_pos + tagsize, tagsize)
  end

  # Returns the top level in the stack.
  def top
    @levels.last
  end

  # Searches the stack from top to bottom looking for a SanitizationLevel with
  # the given tag name. The index of the found level is returned. Nil is
  # returned if nothing is found.
  def most_recent_tag_start(tag_name)
    @levels.rindex {|x| x.tag_name == tag_name}
  end

  # Flattens and sanitizes a node using its associated SanitizationLevel.
  # Every level after 'from' that has not been sanitized is assumed to
  # be an unmatched node start and left escaped.
  def close_node(input, from, current, attribute_whitelist)
    result = ''

    parent_level = @levels[from - 1]
    this_level = @levels[from]

    # if there's any text before 'this' node and the current known end of the
    # parent node, escape and append it
    if this_level.start_pos != 0
      in_between = input[parent_level.end_pos..this_level.start_pos-1]
      result << escape_nontags(in_between)
    end

    # sanitize the attributes of 'this' node.
    attributes = sanitize_attributes(this_level.extra(input), attribute_whitelist)

    # output 'this' node's starting tag (unescaped)
    result << '<' << this_level.tag_name << attributes << '>'

    result << this_level.sanitized

    # flatten every child of 'this' node. we treat those nodes as unmatched
    # starts, leaving them escaped.
    (from + 1..@levels.size-1).each do |i|
      level = @levels[i]
      level_parent = @levels[i-1]

      # escape and append any text between level's start and its parent's end
      result << escape_nontags(input[level_parent.end_pos..level.start_pos-1])

      # escape and append the incomplete level's tag
      result << escape_nontags(input[level.start_pos..level.start_pos + level.tagsize - 1])

      result << level.sanitized
    end

    # append the text between the last parsed tag's end position and the
    # current position, and this level's closing tag
    result << escape_nontags(input[@levels.last.end_pos..current-1]) \
           << '</' << this_level.tag_name << '>'

    # remove every level including and after 'from'
    @levels[from..@levels.size-1] = []

    # save the new top's end_pos. this must be remembered for 'finish', but is
    # overwritten by 'sanitize'
    @last_closed_start = parent_level.end_pos

    return result
  end

  # Finishes up the sanitization process. Deals with any remaining unmatched
  # nodes and any text after the last node start found.
  # The sanitized text is returned.
  def finish(input)
    result = ''

    # if there is more than one level in the stack, there are levels that have
    # not been matched. the text they cover must be escaped and appended.
    if @levels.size > 1
      result << escape_nontags(input[0..@last_closed_start - 1])
    end

    # append the root level's sanitized text, then escape and append escape
    # everything after it
    top = @levels.last
    result << top.sanitized << escape_nontags(input[top.end_pos..input.size-1])

    return result
  end
end

# Sanitizes HTML input using a Hash that maps allowed node names with sets of
# allowed attribute names.
#
# This is a whitelist sanitization function, so the HTML input is escaped
# before it is processed. This way, any dangerous node types you have not
# considered will be dealt with.
#
# In addition to sanitization, this function will do some mild formatting
# and is able to recognize and santize malformed HTML.
#
# Implementation Notes:
# * The sanitized string is built incrementally, thus allowing the use of
#   a stack instead of a tree structure. This strategy is faster than
#   creating an entire tree and more powerful than a simple regex search.
# * The regex used to parse XML node tags is not able to parse multi-line
#   tags. *Tags must therefore exist on one line.*
def sanitize_html(input, whitelist)
  levelstack = SanitizationStack.new

  # escape ALL < and > characters. xml nodes with tag types that exist in the
  # whitelist will be unescaped below.
  input.gsub!(/[<>]/, {'<' => '&lt;', '>' => '&gt;'})

  # process everything that looks like an XML node tag
  input.scan NODE_TAG_REGEX do |tag, slash, tag_name, extra|
    next unless whitelist.has_key? tag_name.downcase

    current = Regexp.last_match.offset(0).first

    has_last_slash = (extra and extra.length > 0 and extra[-1] == '/')
    is_start_node = (slash.empty? and not has_last_slash)

    if is_start_node
      # tag is a node start (ie, <abc>)

      levelstack.push(tag_name, current, tag.size)
    elsif has_last_slash
      # tag is a whole node (ie, <abc/>)

      top = levelstack.top

      # append everything between the end of the last found tag and this tag to
      # the parent's sanitized string
      top.sanitized << input[top.end_pos..current-1] if current != 0

      # append the tag with sanitized attributes to the parent's sanitized
      # string
      top.sanitized << '<' << tag_name << sanitize_attributes(extra, whitelist[tag_name]) << '/>'

      # update the ending position of the parent of the node
      top.end_pos = current + tag.size
    elsif not extra or extra.empty?
      # tag is a node end (ie, </abc>)

      # find the index of the sanitization level that this closing tag
      # references. if none can be found, this is an *unmatched node end*,
      # so we ignore it.
      start_i = levelstack.most_recent_tag_start(tag_name)
      if start_i
        # flatten and remove the sub-tree that starts from start_i
        flattened = levelstack.close_node(input, start_i, current, whitelist[tag_name])

        # if we're closing a <pre> node, unescape everything the node contains.
        if tag_name == 'pre'
          unescape(flattened)
        end

        # append the flattened string to the parent's sanitized string and
        # update its end position.
        top = levelstack.top
        top.sanitized << flattened
        top.end_pos = current + tag.size
      end
    end
  end

  # finish up: add everything after last found tag and make sure to deal with
  # any lingering unmatched node starts
  return levelstack.finish(input)
end
	require 'set'

	# A SanitizationLevel holds information regarding an XML node's start tag.
	# Levels are used to incrementally build sanitized HTML while dealing with
	# incomplete nodes (ie, nodes that are never closed).
	class SanitizationLevel

	# Constructor.
	def initialize(tag_name, sanitized, start_pos, end_pos, tagsize)
	@tag_name = tag_name
	@sanitized = sanitized
	@start_pos = start_pos
	@end_pos = end_pos
	@tagsize = tagsize
	@closed = false
	end

	# Gets the tag name of the node this level is associated with.
	# This property cannot be set.
	def tag_name
	@tag_name
	end

	# Gets the current sanitized string for all levels below this level. This
	# property will only be modified after a node's children have been processed.
	# This property cannot be set.
	def sanitized
	@sanitized
	end

	# Gets the starting index of this level's node in the pre-escaped HTML.
	def start_pos
	@start_pos
	end

	# Gets the current end position of this level's node in the pre-escaped
	# HTML.
	def end_pos
	@end_pos
	end

	# Sets the current end position of this level's node in the pre-escaped
	# HTML.
	def end_pos=(value)
	@end_pos = value
	end

	# Gets the size of the start tag of this level's node.
	# This property cannot be set.
	def tagsize
	@tagsize
	end

	# A helper function that will extract the 'extra' information of this level's
	# node start given the pre-escaped HTML. The 'extra' information is
	# essentially the attributes of an XML node.
	def extra(input)
	input[@start_pos + 4 + @tag_name.size..@start_pos + @tagsize - 5]
	end
	end

	# Regex that matches XML node attributes. The following groups are created by
	# a match:
	# 0. The attribute key.
	# 1. The attribute value (or nil if there is none).
	ATTRIBUTE_REGEX = /([a-zA-Z0-9_]+)(?:\s=\s((?:[a-zA-Z0-9_]+)\|(?:"[^"]+")\|(?:'[^']+'))?)?/

	# Regex that matches XML node starts & node ends in escaped HTML. The following
	# groups are created by a match:
	# 0. the entire match string
	# 1. the '/' in a node end, or nil f not present. used to tell if this is the
	# start or end of a node.
	# 2. the tag id, ie, 'html', 'body', 'input', etc.
	# 3. everything after the tag id and before the ending >
	# NOTE: Regexes are fragile. As such, this regex can parse node starts that use
	# '>' in attribute values, but cannot deal with tags that spanmultiple
	# lines.
	NODE_TAG_REGEX = /(<(\/?)([a-zA-Z0-9_]+)((?:.(?!<)\|(?:"[^"]")\|(?:'[^']'))*)>)/

	# Escapes '&', '"' and '\'' characters in a string and returns the escaped
	# string. This is executed on chunks of text that are known not to be
	# XML tags.
	def escape_nontags(s)
	return s.gsub(/(?:&(?![gl]t;))\|["']/, {'&' => '&', '"' => '"', "'" => '''})
	end

	# Replaces escaped HTML ('<', '>', '&', '"', ''') with
	# the unescaped equivalents. Used when a complete <pre> node is found.
	# The supplied string is modified in-place.
	def unescape(s)
	s.gsub!(/&(?:(?:[lg]t)\|(?:amp)\|(?:quot)\|(?:#39));/,
	{'<' => '<', '>' => '>', '&' => '&', '"' => '"', ''' => "'"})
	end

	# Sanitizes the attributes section of an XML node start by including attributes
	# that are in the supplied whitelist set.
	# The sanitized version is returned.
	def sanitize_attributes(attributes, whitelist)
	sanitized = ''

	attributes.scan ATTRIBUTE_REGEX do \|key, value\|
	if whitelist.include? key.downcase
	sanitized << ' ' << key

	sanitized << '=' << value if value
	end
	end

	return sanitized
	end

	# The stack of sanitization levels. There will always be one level representing
	# the document root.
	# This class is really just a wrapper around an array.
	class SanitizationStack

	# Constructor. Creates the root santization level.
	def initialize()
	@levels = [SanitizationLevel.new('', '', -1, 0, 0)]
	@last_closed_start = 0
	end

	# Adds a sanitization level using the found tag's name, start position (in
	# the pre-escaped HTML), and the size of the tag found.
	# The sanitized string is initialized to an empty string and the level's
	# end position is initialized to the end of the start tag.
	def push(tag_name, start_pos, tagsize)
	@levels << SanitizationLevel.new(tag_name, '', start_pos, start_pos + tagsize, tagsize)
	end

	# Returns the top level in the stack.
	def top
	@levels.last
	end

	# Searches the stack from top to bottom looking for a SanitizationLevel with
	# the given tag name. The index of the found level is returned. Nil is
	# returned if nothing is found.
	def most_recent_tag_start(tag_name)
	@levels.rindex {\|x\| x.tag_name == tag_name}
	end

	# Flattens and sanitizes a node using its associated SanitizationLevel.
	# Every level after 'from' that has not been sanitized is assumed to
	# be an unmatched node start and left escaped.
	def close_node(input, from, current, attribute_whitelist)
	result = ''

	parent_level = @levels[from - 1]
	this_level = @levels[from]

	# if there's any text before 'this' node and the current known end of the
	# parent node, escape and append it
	if this_level.start_pos != 0
	in_between = input[parent_level.end_pos..this_level.start_pos-1]
	result << escape_nontags(in_between)
	end

	# sanitize the attributes of 'this' node.
	attributes = sanitize_attributes(this_level.extra(input), attribute_whitelist)

	# output 'this' node's starting tag (unescaped)
	result << '<' << this_level.tag_name << attributes << '>'

	result << this_level.sanitized

	# flatten every child of 'this' node. we treat those nodes as unmatched
	# starts, leaving them escaped.
	(from + 1..@levels.size-1).each do \|i\|
	level = @levels[i]
	level_parent = @levels[i-1]

	# escape and append any text between level's start and its parent's end
	result << escape_nontags(input[level_parent.end_pos..level.start_pos-1])

	# escape and append the incomplete level's tag
	result << escape_nontags(input[level.start_pos..level.start_pos + level.tagsize - 1])

	result << level.sanitized
	end

	# append the text between the last parsed tag's end position and the
	# current position, and this level's closing tag
	result << escape_nontags(input[@levels.last.end_pos..current-1]) \
	<< '</' << this_level.tag_name << '>'

	# remove every level including and after 'from'
	@levels[from..@levels.size-1] = []

	# save the new top's end_pos. this must be remembered for 'finish', but is
	# overwritten by 'sanitize'
	@last_closed_start = parent_level.end_pos

	return result
	end

	# Finishes up the sanitization process. Deals with any remaining unmatched
	# nodes and any text after the last node start found.
	# The sanitized text is returned.
	def finish(input)
	result = ''

	# if there is more than one level in the stack, there are levels that have
	# not been matched. the text they cover must be escaped and appended.
	if @levels.size > 1
	result << escape_nontags(input[0..@last_closed_start - 1])
	end

	# append the root level's sanitized text, then escape and append escape
	# everything after it
	top = @levels.last
	result << top.sanitized << escape_nontags(input[top.end_pos..input.size-1])

	return result
	end
	end

	# Sanitizes HTML input using a Hash that maps allowed node names with sets of
	# allowed attribute names.
	#
	# This is a whitelist sanitization function, so the HTML input is escaped
	# before it is processed. This way, any dangerous node types you have not
	# considered will be dealt with.
	#
	# In addition to sanitization, this function will do some mild formatting
	# and is able to recognize and santize malformed HTML.
	#
	# Implementation Notes:
	# * The sanitized string is built incrementally, thus allowing the use of
	# a stack instead of a tree structure. This strategy is faster than
	# creating an entire tree and more powerful than a simple regex search.
	# * The regex used to parse XML node tags is not able to parse multi-line
	# tags. Tags must therefore exist on one line.
	def sanitize_html(input, whitelist)
	levelstack = SanitizationStack.new

	# escape ALL < and > characters. xml nodes with tag types that exist in the
	# whitelist will be unescaped below.
	input.gsub!(/[<>]/, {'<' => '<', '>' => '>'})

	# process everything that looks like an XML node tag
	input.scan NODE_TAG_REGEX do \|tag, slash, tag_name, extra\|
	next unless whitelist.has_key? tag_name.downcase

	current = Regexp.last_match.offset(0).first

	has_last_slash = (extra and extra.length > 0 and extra[-1] == '/')
	is_start_node = (slash.empty? and not has_last_slash)

	if is_start_node
	# tag is a node start (ie, <abc>)

	levelstack.push(tag_name, current, tag.size)
	elsif has_last_slash
	# tag is a whole node (ie, <abc/>)

	top = levelstack.top

	# append everything between the end of the last found tag and this tag to
	# the parent's sanitized string
	top.sanitized << input[top.end_pos..current-1] if current != 0

	# append the tag with sanitized attributes to the parent's sanitized
	# string
	top.sanitized << '<' << tag_name << sanitize_attributes(extra, whitelist[tag_name]) << '/>'

	# update the ending position of the parent of the node
	top.end_pos = current + tag.size
	elsif not extra or extra.empty?
	# tag is a node end (ie, </abc>)

	# find the index of the sanitization level that this closing tag
	# references. if none can be found, this is an unmatched node end,
	# so we ignore it.
	start_i = levelstack.most_recent_tag_start(tag_name)
	if start_i
	# flatten and remove the sub-tree that starts from start_i
	flattened = levelstack.close_node(input, start_i, current, whitelist[tag_name])

	# if we're closing a <pre> node, unescape everything the node contains.
	if tag_name == 'pre'
	unescape(flattened)
	end

	# append the flattened string to the parent's sanitized string and
	# update its end position.
	top = levelstack.top
	top.sanitized << flattened
	top.end_pos = current + tag.size
	end
	end
	end

	# finish up: add everything after last found tag and make sure to deal with
	# any lingering unmatched node starts
	return levelstack.finish(input)
	end