# rhulse/html_parser_core.rb

## html_parser_core.rb
# This code is designed to clean up Word HTML to a very great extent.
# It is used at Radio NZ to clean up documents created in Word prior to
# parsing them, line by line, to extract content.

# The Word content is pasted into WYSIWYG which is then HTTP POSTed into
# the app where the string is cleaned by this code.

# We add new lines after block elements because the next stage is a line-based
# parser

# NB: The text you paste into your WYSIWYG should NOT have any smart tags. These
# can be stripped from a document by going to Tools : Autocorrect : Smart Tags, and
# clicking on "Remove Smart Tags".

# A basic test for smart tags (so you can reject it) is:
#  def check_for_smarttags(html)
#    html =~ %r{</o:smarttagtype>}
#  end
#

# For an example of how it is used in practice see gist: http://gist.github.com/552971
#
# Richard Hulse. 27 August 2010

# Copyright (c) Radio New Zealand Limited 2010

# MIT license
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:

# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

require 'rubygems'
require 'sanitize'

# Cleans up HTML pasted from Microsoft Word so that a later, line-based
# parser can consume it: unwanted markup is stripped and every block
# element ends up on its own line.
class ParserCore
  def initialize
    @error_messages = []
  end

  # Returns a cleaned copy of +dirty_html+; the argument itself is never
  # modified, so frozen strings are accepted.
  #
  # dirty_html - the raw HTML string to clean.
  # elements   - tag names to keep, in Sanitize syntax.
  # attributes - attributes to keep, in Sanitize syntax.
  def tidy_html(dirty_html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes = {})
    # Drop line breaks only. (The pattern used to be /[\n|\r]/, which also
    # deleted every literal "|" from the content.)
    html = dirty_html.gsub(/[\n\r]/, '')

    # remove some Word cruft
    html = html.gsub(%r{</?o:p>}i, '')

    # fix some broken tags ("<   p>" becomes "<p>")
    html = html.gsub(/<\s+/, '<')

    # Tidy is used prior to sanitize as it is more robust in certain edge cases.
    # You MAY be able to remove this. YMMV.
    html = tidy(html)

    # keep only the things we want.
    html = strip_tags(html, elements, attributes)

    # Tags that contain or are separated by only whitespace must be reduced
    # to a single space (not butted up), otherwise words run together,
    # e.g. '<p><b>this</b> <b>is a test.</b></p>'. MS Word does output this.
    html = html.gsub(%r{<i>\s+</i>}, ' ')
    html = html.gsub(%r{<b>\s+</b>}, ' ')
    html = html.gsub(%r{</b>\s+<b>}, ' ')

    # Remove redundant empty tags. The order matters: removing an inner
    # empty tag can leave an empty outer tag for a later pass.
    html = html.gsub(%r{<i></i>}, '')
    html = html.gsub(%r{<b></b>}, '')
    html = html.gsub(%r{<p></p>}, '')
    html = html.gsub(%r{<p><b></b></p>}, '')

    # butt up any remaining tags
    html = html.gsub(/&nbsp;/, ' ')
    html = html.gsub(/>\s+</, '><')

    # add new lines at the end of block elements (and after an opening <dl>)
    html = html.gsub(%r{</(p|h\d|dt|dd|dl)>}, "</\\1>\n")
    html.gsub(/<dl>/, "<dl>\n")
  end

  # Runs +dirty_html+ through Sanitize, keeping only the given elements and
  # attributes, and returns the result with surrounding whitespace removed.
  def strip_tags(dirty_html, elements = [], attributes = {})
    Sanitize.clean(dirty_html, :elements => elements, :attributes => attributes).strip
  end

  # Returns a copy of +dirty_html+ with every time of the form "9.05" or
  # "10:30" removed and surrounding whitespace stripped. The argument is
  # not modified, so frozen strings are accepted.
  def strip_time(dirty_html)
    dirty_html.gsub(/\d{1,2}(\.|:)\d{2}/, '').strip
  end

  # Pipes +dirty_html+ through the external `tidy` command and returns its
  # output. Falls back to returning +dirty_html+ unchanged when tidy
  # produces no output or the pipe breaks. tidy's own diagnostics are
  # written to log/tidy_errors.log under Rails.root.
  def tidy(dirty_html)
    error_file = File.join(Rails.root, '/log/tidy_errors.log')
    tidy_options = '--word-2000 1 --indent 0 --bare 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -utf8'
    cleaned_html = nil

    begin
      # The block form guarantees the pipe is closed whatever happens.
      IO.popen("tidy -f #{error_file} #{tidy_options}", 'w+') do |pipe|
        pipe.write(dirty_html)
        pipe.close_write
        cleaned_html = pipe.read
      end
    rescue Errno::EPIPE => e
      # Interpolate the message: String + Exception raises TypeError.
      $stderr.puts "Running 'tidy' failed: #{e.message}"
    end

    return cleaned_html unless cleaned_html.nil? || cleaned_html.empty?

    dirty_html
  end
end
	# This code is designed to clean up Word HTML to a very great extent.
	# It is used at Radio NZ to clean up documents created in Word prior to
	# parsing them, line by line, to extract content.

	# The Word content is pasted into WYSIWYG which is then HTTP POSTed into
	# the app where the string is cleaned by this code.

	# We add new lines after block elements because the next stage is a line-based
	# parser

	# NB: The text you paste into your WYSIWYG should NOT have any smart tags. These
	# can be stripped from a document by going to Tools : Autocorrect : Smart Tags, and
	# clicking on "Remove Smart Tags".

	# A basic test for smart tags (so you can reject it) is:
	# def check_for_smarttags(html)
	# html =~ %r{</o:smarttagtype>}
	# end
	#

	# For an example of how it is used in practice see gist: http://gist.github.com/552971
	#
	# Richard Hulse. 27 August 2010

	# Copyright (c) Radio New Zealand Limited 2010

	# MIT license
	# Permission is hereby granted, free of charge, to any person obtaining
	# a copy of this software and associated documentation files (the
	# "Software"), to deal in the Software without restriction, including
	# without limitation the rights to use, copy, modify, merge, publish,
	# distribute, sublicense, and/or sell copies of the Software, and to
	# permit persons to whom the Software is furnished to do so, subject to
	# the following conditions:

	# The above copyright notice and this permission notice shall be
	# included in all copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
	# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
	# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
	# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

	require 'rubygems'
	require 'sanitize'

	# Cleans up HTML pasted from Microsoft Word so that a later, line-based
	# parser can consume it: unwanted markup is stripped and every block
	# element ends up on its own line.
	class ParserCore
	  def initialize
	    @error_messages = []
	  end

	  # Returns a cleaned copy of +dirty_html+; the argument itself is never
	  # modified, so frozen strings are accepted.
	  #
	  # dirty_html - the raw HTML string to clean.
	  # elements   - tag names to keep, in Sanitize syntax.
	  # attributes - attributes to keep, in Sanitize syntax.
	  def tidy_html(dirty_html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes = {})
	    # Drop line breaks only; a "|" inside the character class would be
	    # taken literally and delete pipe characters from the content.
	    html = dirty_html.gsub(/[\n\r]/, '')

	    # remove some Word cruft
	    html = html.gsub(%r{</?o:p>}i, '')

	    # fix some broken tags ("<   p>" becomes "<p>")
	    html = html.gsub(/<\s+/, '<')

	    # Tidy is used prior to sanitize as it is more robust in certain edge cases.
	    # You MAY be able to remove this. YMMV.
	    html = tidy(html)

	    # keep only the things we want.
	    html = strip_tags(html, elements, attributes)

	    # Tags that contain or are separated by only whitespace must be reduced
	    # to a single space (not butted up), otherwise words run together,
	    # e.g. '<p><b>this</b> <b>is a test.</b></p>'. MS Word does output this.
	    html = html.gsub(%r{<i>\s+</i>}, ' ')
	    html = html.gsub(%r{<b>\s+</b>}, ' ')
	    html = html.gsub(%r{</b>\s+<b>}, ' ')

	    # Remove redundant empty tags. The order matters: removing an inner
	    # empty tag can leave an empty outer tag for a later pass.
	    html = html.gsub(%r{<i></i>}, '')
	    html = html.gsub(%r{<b></b>}, '')
	    html = html.gsub(%r{<p></p>}, '')
	    html = html.gsub(%r{<p><b></b></p>}, '')

	    # Butt up any remaining tags. The entity is spelled out here; the
	    # previous pattern replaced a space with a space and did nothing.
	    html = html.gsub(/&nbsp;/, ' ')
	    html = html.gsub(/>\s+</, '><')

	    # Add new lines at the end of block elements (and after an opening
	    # <dl>). The alternation bars must be unescaped to act as "or".
	    html = html.gsub(%r{</(p|h\d|dt|dd|dl)>}, "</\\1>\n")
	    html.gsub(/<dl>/, "<dl>\n")
	  end

	  # Runs +dirty_html+ through Sanitize, keeping only the given elements and
	  # attributes, and returns the result with surrounding whitespace removed.
	  def strip_tags(dirty_html, elements = [], attributes = {})
	    Sanitize.clean(dirty_html, :elements => elements, :attributes => attributes).strip
	  end

	  # Returns a copy of +dirty_html+ with every time of the form "9.05" or
	  # "10:30" removed and surrounding whitespace stripped. The argument is
	  # not modified, so frozen strings are accepted.
	  def strip_time(dirty_html)
	    dirty_html.gsub(/\d{1,2}(\.|:)\d{2}/, '').strip
	  end

	  # Pipes +dirty_html+ through the external `tidy` command and returns its
	  # output. Falls back to returning +dirty_html+ unchanged when tidy
	  # produces no output or the pipe breaks. tidy's own diagnostics are
	  # written to log/tidy_errors.log under Rails.root.
	  def tidy(dirty_html)
	    error_file = File.join(Rails.root, '/log/tidy_errors.log')
	    tidy_options = '--word-2000 1 --indent 0 --bare 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -utf8'
	    cleaned_html = nil

	    begin
	      # The block form guarantees the pipe is closed whatever happens.
	      IO.popen("tidy -f #{error_file} #{tidy_options}", 'w+') do |pipe|
	        pipe.write(dirty_html)
	        pipe.close_write
	        cleaned_html = pipe.read
	      end
	    rescue Errno::EPIPE => e
	      # Interpolate the message: String + Exception raises TypeError.
	      $stderr.puts "Running 'tidy' failed: #{e.message}"
	    end

	    return cleaned_html unless cleaned_html.nil? || cleaned_html.empty?

	    dirty_html
	  end
	end