# gist:139987

## gistfile1.rb
#
# This function takes messy Word HTML that was pasted into a WYSIWYG editor and cleans it up.
# It keeps only the tags and attributes specified in the params.
# Copyright (c) 2009, Radio New Zealand
# Released under the MIT license

require 'rubygems'
require 'sanitize'

# Cleans up messy Word HTML that was pasted into a WYSIWYG editor.
#
# The markup is passed through Sanitize so that only the listed elements and
# attributes are requested to survive, and is then tidied with a fixed series
# of regexp substitutions. The string passed in is never modified.
#
# @param html [String, nil] raw markup; nil is treated as an empty string
# @param elements [Array<String>] tag names handed to Sanitize as :elements
# @param attributes [Hash] handed to Sanitize as :attributes
# @return [String] the cleaned markup, with a newline after every closing
#   p, hN, dt, dd and dl tag and after every opening dl tag
def clean_up_word_html(html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes = {})
  # Character classes rather than nested single-character alternations: the
  # same addresses match, without ambiguous paths for the regexp engine.
  email_line = /<p>Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i

  # Drop line breaks (CR and LF only). Non-destructive gsub leaves the caller's
  # string untouched and also accepts frozen strings; to_s turns nil into "".
  cleaned = html.to_s.gsub(/[\r\n]/, '')

  # keep only the things we want.
  # Prefer Sanitize.fragment when the installed gem provides it; fall back to
  # Sanitize.clean otherwise.
  config = { :elements => elements, :attributes => attributes }
  cleaned = if Sanitize.respond_to?(:fragment)
              Sanitize.fragment(cleaned, config)
            else
              Sanitize.clean(cleaned, config)
            end

  # butt up any tags
  cleaned = cleaned.gsub(/&nbsp;/, ' ').gsub(/>\s+</, '><')

  # remove email address lines (the opening <p> is kept)
  cleaned = cleaned.gsub(email_line, '<p>')

  # post sanitize cleanup of empty blocks
  # the order of removal is important - this is the way Word stacks these elements
  [%r{<i></i>}, %r{<b></b>}, %r{</b><b>}, %r{<p></p>}, %r{<p><b></b></p>}].each do |empty_markup|
    cleaned = cleaned.gsub(empty_markup, '')
  end

  # misc - fix butted times ("9am " becomes "9 am ")
  cleaned = cleaned.gsub(/(\d)(am|pm) /, '\1 \2 ')
  # misc - remove multiple spaces that may cause doc specific regexes to fail (in dates for example)
  cleaned = cleaned.gsub(/\s+/, ' ')

  # add new lines at the end of lines; this must come after the whitespace
  # collapse above, which would otherwise turn the new line breaks into spaces
  cleaned = cleaned.gsub(%r{</(p|h\d|dt|dd|dl)>}, "</\\1>\n")
  cleaned.gsub(/<dl>/, "<dl>\n")
end
	#
	# This function takes messy Word HTML that was pasted into a WYSIWYG editor and cleans it up.
	# It keeps only the tags and attributes specified in the params.
	# Copyright (c) 2009, Radio New Zealand
	# Released under the MIT license

	require 'rubygems'
	require 'sanitize'

	# Cleans up messy Word HTML that was pasted into a WYSIWYG editor.
	#
	# The markup is passed through Sanitize so that only the listed elements and
	# attributes are requested to survive, and is then tidied with a fixed series
	# of regexp substitutions. The string passed in is never modified.
	#
	# @param html [String, nil] raw markup; nil is treated as an empty string
	# @param elements [Array<String>] tag names handed to Sanitize as :elements
	# @param attributes [Hash] handed to Sanitize as :attributes
	# @return [String] the cleaned markup, with a newline after every closing
	#   p, hN, dt, dd and dl tag and after every opening dl tag
	def clean_up_word_html(html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes = {})
		# Matches "<p>Email: user@host.tld". Character classes are used so that
		# "|" is never treated as a literal part of the address.
		email_line = /<p>Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i

		# Drop line breaks (CR and LF only). Non-destructive gsub leaves the
		# caller's string untouched and also accepts frozen strings; to_s turns
		# nil into "".
		cleaned = html.to_s.gsub(/[\r\n]/, '')

		# keep only the things we want.
		# Prefer Sanitize.fragment when the installed gem provides it; fall back
		# to Sanitize.clean otherwise.
		config = { :elements => elements, :attributes => attributes }
		cleaned = if Sanitize.respond_to?(:fragment)
			Sanitize.fragment(cleaned, config)
		else
			Sanitize.clean(cleaned, config)
		end

		# butt up any tags: non-breaking space entities become plain spaces,
		# then whitespace between adjacent tags is removed
		cleaned = cleaned.gsub(/&nbsp;/, ' ').gsub(/>\s+</, '><')

		# remove email address lines (the opening <p> is kept)
		cleaned = cleaned.gsub(email_line, '<p>')

		# post sanitize cleanup of empty blocks
		# the order of removal is important - this is the way Word stacks these elements
		[%r{<i></i>}, %r{<b></b>}, %r{</b><b>}, %r{<p></p>}, %r{<p><b></b></p>}].each do |empty_markup|
			cleaned = cleaned.gsub(empty_markup, '')
		end

		# misc - fix butted times ("9am " becomes "9 am ")
		cleaned = cleaned.gsub(/(\d)(am|pm) /, '\1 \2 ')
		# misc - remove multiple spaces that may cause doc specific regexes to fail (in dates for example)
		cleaned = cleaned.gsub(/\s+/, ' ')

		# add new lines at the end of lines; this must come after the whitespace
		# collapse above, which would otherwise turn the new line breaks into spaces
		cleaned = cleaned.gsub(%r{</(p|h\d|dt|dd|dl)>}, "</\\1>\n")
		cleaned.gsub(/<dl>/, "<dl>\n")
	end