# mrchrisadams/email_sanitizer.rb

## email_sanitizer.rb
# This module takes HTML that has been submitted using a rich text editor (like
# the YUI editor), strips out all HTML tags except a small subgroup (like <br>, <p>),
# and converts the remaining markup tags into newline (\n) characters for use
# in plain text emails.
#

module EmailSanitizer

  # Turns free-form user text into simple paragraph markup and sanitizes it.
  #
  # The stripped input is wrapped in <p>...</p>, a blank line becomes a
  # paragraph boundary (</p><p>) and every remaining single newline becomes
  # <br/>. The result is then run through sanitize_fu with its default
  # whitelist.
  #
  # input - the raw text/HTML from the user; nil is treated as an empty string.
  #
  # Returns a new String; the argument itself is not modified.
  def format_user_text(input)
    output = "<p>#{input.to_s.strip}</p>"

    output.gsub!(/\r\n/, "\n")       # normalise CRLF line endings to LF
    output.gsub!(/^$\s*/m, "\n")     # collapse an empty line plus trailing whitespace
    output.gsub!(/\n{3,}/, "\n\n")   # never more than two newlines in a row
    output.gsub!(/\n\n/, '</p><p>')  # a blank line separates paragraphs
    output.gsub!(/\n/, '<br/>')      # nl2br for the remaining single newlines

    sanitize_fu(output)
  end

  # Produces a plain-text rendering of user-submitted rich text for use in a
  # plain text email.
  #
  # The input first goes through format_user_text (and therefore sanitize_fu),
  # after which the few tags that survive the whitelist are translated:
  # paragraph and line-break tags become "\n", hyperlink tags are dropped
  # (their link text is kept) and &nbsp; becomes a regular space. Because the
  # wrapping <p>/</p> pair is turned into newlines, the returned text starts
  # and ends with a newline.
  #
  # input - the raw text/HTML from the user; nil is treated as an empty string.
  #
  # Returns a new String.
  def sanitize_text_for_email(input)
    text = format_user_text(input)

    # Paragraph boundaries become newlines. The self-closing form is matched
    # too, because sanitize_fu re-emits "<p/>" as "<p />".
    text.gsub!(%r{<p\s*/?>|</p>}, "\n")

    # Non-breaking spaces inserted by the rich text editor become plain spaces.
    text.gsub!('&nbsp;', ' ')

    # Drop hyperlink tags but keep the link text. [^>]* stops at the end of
    # each tag, so text between two links on the same line is preserved.
    text.gsub!(%r{</a>|<a\b[^>]*>}, '')

    # Line breaks become newlines. sanitize_fu emits "<br>" for a bare tag and
    # "<br />" for the "<br/>" produced by format_user_text, so every spelling
    # has to be matched or markup leaks into the plain text.
    text.gsub!(%r{<br\s*/?>}, "\n")

    # Finally normalise vertical spacing: at most one blank line in a row.
    text.gsub!(/\n{2,}/, "\n\n")

    text
  end

  # Removes every tag that is not on a whitelist and balances the tags that
  # remain.
  # Adapted from http://ideoplex.com/id/1138/sanitize-html-in-ruby
  #
  # html   - the markup to clean; nil is treated as an empty string.
  # okTags - comma-separated whitelist. Each entry is a tag name optionally
  #          followed by space-separated attribute names allowed on that tag,
  #          e.g. 'a href, b'. Matching is case-insensitive.
  #
  # Behaviour, per <...> element found in the input:
  # * closing tag: kept only if the tag is allowed and currently open; any
  #   tags opened after it are closed first so the output stays nested.
  # * self-closing tag (<tag />): kept as "<tag />" if allowed; attributes
  #   are dropped.
  # * opening tag: kept if allowed, with only whitelisted, double-quoted
  #   attributes copied over.
  # * anything else (disallowed tags, comments, ...) is removed.
  # Tags still open at the end of the input are closed at the end of the
  # result.
  #
  # NOTE(review): attribute values are copied through unchanged, so a
  # whitelisted href may still carry e.g. a javascript: URL.
  #
  # Returns a new String.
  def sanitize_fu(html, okTags = 'a href, b, br, p, i, em')
    # Tags that never take a closing tag; they are never pushed on the stack.
    solo_tags = %w[br hr]

    # Map each allowed tag name to the attribute names it may keep.
    allowed = {}
    okTags.downcase.split(',').each do |spec|
      name, *attrs = spec.split(' ')
      next if name.nil? # empty entry such as the middle of "a,,b"

      allowed[name] = attrs
    end

    # Names of the allowed tags that are currently open, innermost last.
    stack = []

    result = html.to_s.gsub(/(<.*?>)/m) do |element|
      if element =~ %r{\A</(\w+)}
        # </tag>
        tag = Regexp.last_match(1).downcase
        if allowed.key?(tag) && stack.include?(tag)
          # Close everything opened after this tag, then the tag itself.
          top = stack.pop
          out = "</#{top}>"
          until top == tag
            top = stack.pop
            out << "</#{top}>"
          end
          out
        end
      elsif element =~ %r{\A<(\w+)\s*/>}
        # <tag />
        tag = Regexp.last_match(1).downcase
        "<#{tag} />" if allowed.key?(tag)
      elsif element =~ /\A<(\w+)/
        # <tag ...>
        tag = Regexp.last_match(1).downcase
        # Everything after the tag name: the attribute list and closing ">".
        rest = Regexp.last_match.post_match
        if allowed.key?(tag)
          stack.push(tag) unless solo_tags.include?(tag)
          if allowed[tag].empty?
            # no attributes allowed on this tag
            "<#{tag}>"
          else
            # copy over only the whitelisted name="value" pairs
            out = "<#{tag}"
            rest.scan(/(\w+)=("[^"]+")/) do |name, value|
              attr = name.downcase
              out << " #{attr}=#{value}" if allowed[tag].include?(attr)
            end
            out << '>'
          end
        end
      end
    end

    # eat up unmatched leading > characters, one per pass
    loop do
      break unless result.sub!(/\A([^<]*)>/m) { Regexp.last_match(1) }
    end

    # eat up unmatched trailing < characters, one per pass
    loop do
      break unless result.sub!(/<([^>]*)\Z/m) { Regexp.last_match(1) }
    end

    # close whatever is still open, innermost tag first
    result << "</#{stack.reverse.join('></')}>" unless stack.empty?

    result
  end

end
	# This module takes HTML that has been submitted using a rich text editor (like
	# the YUI editor), strips out all HTML tags except a small subgroup (like <br>, <p>),
	# and converts the remaining markup tags into newline (\n) characters for use
	# in plain text emails.
	#

	module EmailSanitizer

	  # Turns free-form user text into simple paragraph markup and sanitizes it.
	  #
	  # The stripped input is wrapped in <p>...</p>, a blank line becomes a
	  # paragraph boundary (</p><p>) and every remaining single newline becomes
	  # <br/>. The result is then run through sanitize_fu with its default
	  # whitelist.
	  #
	  # input - the raw text/HTML from the user; nil is treated as an empty string.
	  #
	  # Returns a new String; the argument itself is not modified.
	  def format_user_text(input)
	    output = "<p>#{input.to_s.strip}</p>"

	    output.gsub!(/\r\n/, "\n")       # normalise CRLF line endings to LF
	    output.gsub!(/^$\s*/m, "\n")     # collapse an empty line plus trailing whitespace
	    output.gsub!(/\n{3,}/, "\n\n")   # never more than two newlines in a row
	    output.gsub!(/\n\n/, '</p><p>')  # a blank line separates paragraphs
	    output.gsub!(/\n/, '<br/>')      # nl2br for the remaining single newlines

	    sanitize_fu(output)
	  end

	  # Produces a plain-text rendering of user-submitted rich text for use in a
	  # plain text email.
	  #
	  # The input first goes through format_user_text (and therefore sanitize_fu),
	  # after which the few tags that survive the whitelist are translated:
	  # paragraph and line-break tags become "\n", hyperlink tags are dropped
	  # (their link text is kept) and &nbsp; becomes a regular space. Because the
	  # wrapping <p>/</p> pair is turned into newlines, the returned text starts
	  # and ends with a newline.
	  #
	  # input - the raw text/HTML from the user; nil is treated as an empty string.
	  #
	  # Returns a new String.
	  def sanitize_text_for_email(input)
	    text = format_user_text(input)

	    # Paragraph boundaries become newlines. The self-closing form is matched
	    # too, because sanitize_fu re-emits "<p/>" as "<p />".
	    text.gsub!(%r{<p\s*/?>|</p>}, "\n")

	    # Non-breaking spaces inserted by the rich text editor become plain spaces.
	    text.gsub!('&nbsp;', ' ')

	    # Drop hyperlink tags but keep the link text. [^>]* stops at the end of
	    # each tag, so text between two links on the same line is preserved.
	    text.gsub!(%r{</a>|<a\b[^>]*>}, '')

	    # Line breaks become newlines. sanitize_fu emits "<br>" for a bare tag and
	    # "<br />" for the "<br/>" produced by format_user_text, so every spelling
	    # has to be matched or markup leaks into the plain text.
	    text.gsub!(%r{<br\s*/?>}, "\n")

	    # Finally normalise vertical spacing: at most one blank line in a row.
	    text.gsub!(/\n{2,}/, "\n\n")

	    text
	  end

	  # Removes every tag that is not on a whitelist and balances the tags that
	  # remain.
	  # Adapted from http://ideoplex.com/id/1138/sanitize-html-in-ruby
	  #
	  # html   - the markup to clean; nil is treated as an empty string.
	  # okTags - comma-separated whitelist. Each entry is a tag name optionally
	  #          followed by space-separated attribute names allowed on that tag,
	  #          e.g. 'a href, b'. Matching is case-insensitive.
	  #
	  # Behaviour, per <...> element found in the input:
	  # * closing tag: kept only if the tag is allowed and currently open; any
	  #   tags opened after it are closed first so the output stays nested.
	  # * self-closing tag (<tag />): kept as "<tag />" if allowed; attributes
	  #   are dropped.
	  # * opening tag: kept if allowed, with only whitelisted, double-quoted
	  #   attributes copied over.
	  # * anything else (disallowed tags, comments, ...) is removed.
	  # Tags still open at the end of the input are closed at the end of the
	  # result.
	  #
	  # NOTE(review): attribute values are copied through unchanged, so a
	  # whitelisted href may still carry e.g. a javascript: URL.
	  #
	  # Returns a new String.
	  def sanitize_fu(html, okTags = 'a href, b, br, p, i, em')
	    # Tags that never take a closing tag; they are never pushed on the stack.
	    solo_tags = %w[br hr]

	    # Map each allowed tag name to the attribute names it may keep.
	    allowed = {}
	    okTags.downcase.split(',').each do |spec|
	      name, *attrs = spec.split(' ')
	      next if name.nil? # empty entry such as the middle of "a,,b"

	      allowed[name] = attrs
	    end

	    # Names of the allowed tags that are currently open, innermost last.
	    stack = []

	    result = html.to_s.gsub(/(<.*?>)/m) do |element|
	      if element =~ %r{\A</(\w+)}
	        # </tag>
	        tag = Regexp.last_match(1).downcase
	        if allowed.key?(tag) && stack.include?(tag)
	          # Close everything opened after this tag, then the tag itself.
	          top = stack.pop
	          out = "</#{top}>"
	          until top == tag
	            top = stack.pop
	            out << "</#{top}>"
	          end
	          out
	        end
	      elsif element =~ %r{\A<(\w+)\s*/>}
	        # <tag />
	        tag = Regexp.last_match(1).downcase
	        "<#{tag} />" if allowed.key?(tag)
	      elsif element =~ /\A<(\w+)/
	        # <tag ...>
	        tag = Regexp.last_match(1).downcase
	        # Everything after the tag name: the attribute list and closing ">".
	        rest = Regexp.last_match.post_match
	        if allowed.key?(tag)
	          stack.push(tag) unless solo_tags.include?(tag)
	          if allowed[tag].empty?
	            # no attributes allowed on this tag
	            "<#{tag}>"
	          else
	            # copy over only the whitelisted name="value" pairs
	            out = "<#{tag}"
	            rest.scan(/(\w+)=("[^"]+")/) do |name, value|
	              attr = name.downcase
	              out << " #{attr}=#{value}" if allowed[tag].include?(attr)
	            end
	            out << '>'
	          end
	        end
	      end
	    end

	    # eat up unmatched leading > characters, one per pass
	    loop do
	      break unless result.sub!(/\A([^<]*)>/m) { Regexp.last_match(1) }
	    end

	    # eat up unmatched trailing < characters, one per pass
	    loop do
	      break unless result.sub!(/<([^>]*)\Z/m) { Regexp.last_match(1) }
	    end

	    # close whatever is still open, innermost tag first
	    result << "</#{stack.reverse.join('></')}>" unless stack.empty?

	    result
	  end

	end