Created
July 21, 2009 11:33
-
-
Save mrchrisadams/151293 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This module takes HTML that has been submitted through a rich text editor
# (such as the YUI editor), strips out all HTML tags except a small whitelist
# (like <br>, <p>), and converts the remaining markup tags into newline (\n)
# characters for use in plain text emails.
#
# Mixin that cleans up HTML produced by a rich text editor: it whitelists a
# small set of tags and can flatten the result to text for plain text emails.
module EmailSanitizer
  # Wraps user text in paragraphs and runs it through the tag whitelist.
  #
  # Blank lines become paragraph boundaries, single newlines become <br/>
  # (which sanitize_fu re-emits as "<br />"), and the result is sanitized.
  #
  # @param input [String, nil] raw text/HTML from the editor; nil is treated as ""
  # @return [String] HTML containing only the tags sanitize_fu allows by default
  def format_user_text(input)
    paragraphs = "<p>#{input.to_s.strip}</p>"
                 .gsub(/\r\n/, "\n")           # normalise CRLF to LF
                 .gsub(/^$\s*/m, "\n")         # collapse runs of blank lines
                 .gsub(/\n{3,}/, "\n\n")       # never more than one empty line
                 .gsub(/\n\n/, '</p><p>')      # an empty line separates paragraphs
                 .gsub(/\n/, '<br/>')          # remaining newlines are line breaks
    sanitize_fu(paragraphs)
  end

  # Runs format_user_text, then flattens the markup for a plain text email.
  #
  # Paragraph tags and line breaks become newlines, anchor tags are removed
  # (their link text is kept), and runs of newlines are collapsed to exactly
  # two. Other whitelisted inline tags (b, i, em) are left in place, and the
  # result keeps the leading/trailing newline produced by the outer paragraph.
  #
  # @param input [String, nil] raw text/HTML from the editor
  # @return [String] text with paragraph, anchor and line-break tags removed
  def sanitize_text_for_email(input)
    text = format_user_text(input)

    # Paragraph boundaries become newlines; "<p />" is what sanitize_fu emits
    # for a self-closed paragraph, so accept that form too.
    text = text.gsub(%r{</?p\s*/?>}, "\n")

    # NOTE(review): the original call replaced ' ' with ' ', a no-op as
    # written; the first argument was presumably a non-breaking space lost
    # along the way. Converting the &nbsp; entity is the assumed intent - confirm.
    text = text.gsub('&nbsp;', ' ')

    # Drop anchor tags but keep the link text. [^>]* stops at the end of each
    # tag, so text between two links on the same line is preserved.
    text = text.gsub(%r{</?a\b[^>]*>}, '')

    # Line breaks arrive as "<br>" or "<br />" after sanitize_fu; each one
    # becomes a newline so single line breaks survive in plain text.
    text = text.gsub(%r{<br\s*/?>}, "\n")

    # Normalise the distance between lines: only ever two newlines in a row.
    text.gsub(/\n{2,}/, "\n\n")
  end

  # Removes every tag that is not on the whitelist, drops attributes that are
  # not whitelisted for their tag, and closes any allowed tags left open.
  # Adapted from http://ideoplex.com/id/1138/sanitize-html-in-ruby
  #
  # Attribute values are copied through verbatim when double-quoted; they are
  # not validated (an href may hold any scheme).
  #
  # @param html [String, nil] markup to clean; nil is treated as ""
  # @param okTags [String] comma-separated whitelist; each entry is a tag name
  #   optionally followed by space-separated attribute names allowed on it
  # @return [String] the sanitized markup
  def sanitize_fu(html, okTags = 'a href, b, br, p, i, em')
    # No closing tag is necessary for these, so they never go on the stack.
    solo_tags = %w[br hr]

    # tag name => list of attribute names permitted on that tag
    allowed = okTags.downcase.split(',').each_with_object({}) do |spec, table|
      name, *attributes = spec.split(' ')
      table[name] = attributes
    end

    open_tags = [] # allowed tags opened so far and not yet closed
    result = html.to_s.gsub(/<.*?>/m) do |element|
      if (closing = element.match(%r{\A</(\w+)}))
        # </tag>: only honoured when the tag is allowed and currently open.
        tag = closing[1].downcase
        next '' unless allowed.key?(tag) && open_tags.include?(tag)

        # Close everything opened after the most recent matching tag as well.
        unwound = open_tags.slice!(open_tags.rindex(tag)..-1)
        unwound.reverse.map { |name| "</#{name}>" }.join
      elsif (self_closed = element.match(%r{\A<(\w+)\s*/>}))
        # <tag />
        tag = self_closed[1].downcase
        allowed.key?(tag) ? "<#{tag} />" : ''
      elsif (opening = element.match(/\A<(\w+)/))
        # <tag ...>
        tag = opening[1].downcase
        next '' unless allowed.key?(tag)

        open_tags.push(tag) unless solo_tags.include?(tag)

        # Keep only double-quoted attributes whitelisted for this tag.
        kept = opening.post_match.scan(/(\w+)=("[^"]+")/).select do |name, _value|
          allowed[tag].include?(name.downcase)
        end
        "<#{tag}#{kept.map { |name, value| " #{name.downcase}=#{value}" }.join}>"
      else
        # Anything else between < and > is not a recognisable tag: drop it.
        ''
      end
    end

    # Eat up unmatched '>' characters that appear before the first '<'.
    nil while result.sub!(/\A([^<]*)>/m) { Regexp.last_match(1) }
    # Eat up unmatched '<' characters that appear after the last '>'.
    nil while result.sub!(/<([^>]*)\Z/m) { Regexp.last_match(1) }

    # Close whatever is still open, innermost first.
    open_tags.reverse_each { |name| result << "</#{name}>" }
    result
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment