# rhulse/html_parser_docs.rb

## html_parser_docs.rb
# This code is used to 'line parse' schedules that are created in MS Word.
# It builds on the gist http://gist.github.com/552955

# Once the HTML is cleaned up in core, it is passed into a class
# based on the type of document.

# Each class works through a schedule document line-by-line, determining
# the context - what is the day and event

# These events are stored and imported into the main CMS.

# The checksum routine provides an internal check to ensure that
# the data collected and converted into objects is accurate.

# I consider this code to be a 'glorious' hack. It is stable most
# of the time, parsing documents with a set structure.

# It is finely tuned for our specific purpose, thus if the document does
# not match the expected structure, it fails.

# The code is provided in the hope that someone may find the techniques developed useful.

# Earlier versions of this code (3) were written in PHP and have been in weekly use
# since at least 2005.

# This is the third generation of Ruby based code.

# At the end are some classes that we used to re-parse schedules from the live
# site so we could insert this information into our new Rails-based CMS.

# Richard Hulse. 27 August 2010

# Copyright (c) Radio New Zealand Limited 2010

# MIT license
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:

# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

require 'html_parser_core'

# Base class for the schedule parsers.
#
# A schedule document is processed line by line. Every line is classified as
# a date heading (starts a new day), an event heading (starts a new
# programme) or description text (appended to the body of the current event).
# The events are gathered in a hash keyed by their start time in epoch seconds.
#
# Subclasses adjust the regexps set up in #initialize and override the hook
# methods (pre_tidy_cleanup, post_tidy_cleanup, pre_test_cleanup,
# is_content_to_skip).
#
# tidy_html, strip_tags, strip_time and @error_messages are not defined in
# this class; presumably they come from ParserCore - verify against
# html_parser_core.
class ScheduleParser < ParserCore
  def initialize
    # Regexps for detecting the current state and context.
    # Capture 1 of the date regexp is the full date text, e.g. "Monday 3 May 2010".
    @line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday)(?:,?)\s+\d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    @line_is_event_heading_regexp = /<p><b>/i
    # Regexp source (not a plain string) for the hour/minute separator.
    @time_delimiter = '\.'

    # tracking the count of days (used as a checksum)
    @day_count = 0
    @expected_day_count = 7
    # the hash holds data for the current week, keyed by current_event_id (epoch time)
    @week = {}
    # the current html being worked on
    @html = ''
    super
  end

  # Hook: called on @html before tidy_html is run. Does nothing by default.
  def pre_tidy_cleanup
  end

  # Hook: called on @html after tidy_html is run. Does nothing by default.
  def post_tidy_cleanup
  end

  # Hook: called to change a line, if it needs it, prior to the line being
  # checked for context. Returns the line to classify.
  def pre_test_cleanup(line)
    line
  end

  # Hook: returns a true value for line content we want to skip over,
  # i.e. not include in the parse. Nothing is skipped by default.
  def is_content_to_skip(line)
    false
  end

  # The main parsing method.
  #
  # dirty_html - the raw HTML of the schedule document. It is cloned first, so
  #              the cleanup hooks work on a copy rather than the caller's string.
  #
  # Returns the hash of events: epoch seconds => { :body, :time, :day, :title }.
  def parse(dirty_html)
    @html = dirty_html.clone
    @week = {}
    # Start the day checksum afresh, just like @week; otherwise parsing a
    # second document with the same instance keeps counting from the first.
    @day_count = 0

    pre_tidy_cleanup
    @html = tidy_html(@html)
    post_tidy_cleanup

    current_date_id = ''
    current_event_id = ''
    current_date_string = ''

    index = 0      # number of events seen so far on the current day
    last_hour = 0  # hour (24 hour clock) of the previous event

    @html.each_line do |line|
      next if is_content_to_skip(line)

      line = pre_test_cleanup(line)

      case line
      # the date title for a page
      when @line_has_date_regexp
        @day_count += 1
        index = 0     # reset the index for the start of a new day
        last_hour = 0 # new day

        current_date_string = $1
        # a numerical version of the date to base the day's events on
        current_date_id = Time.parse(current_date_string).to_i

      when @line_is_event_heading_regexp
        line =~ /((\d{1,2})#{@time_delimiter}(\d{2}))/
        # nil.to_i is 0, so a heading without a recognisable time is filed at 0.00
        this_hour = $2.to_i
        this_minute = $3.to_i
        this_time = '%02d.%02d' % [this_hour, this_minute] # '%I.%M' format for testing

        # The first event of a day at 12.xx is just after midnight.
        # NOTE(review): for a first event at exactly 12.00 :time stays "12.00"
        # while checksum_is_ok? expects "00.00" for midnight - confirm whether
        # such documents occur.
        this_hour = 0 if index == 0 && this_hour == 12

        index += 1

        # the document uses a 12 hour clock: an hour going backwards means we passed midday
        this_hour += 12 if last_hour > this_hour

        last_hour = this_hour # keep track of the previous hour

        # set the time to midnight + the time of the event
        current_event_id = current_date_id + (this_hour * 60 * 60) + (this_minute * 60)

        # midnight is the next day, so we have to fix the check data
        if this_hour == 24
          # midnight is 12 am in 12 hours clock time
          this_time = '00.00'
          # fake the captured string to the real (next) day
          current_date_string = format_day(Time.at(current_event_id))
        end

        @week[current_event_id] = {
          :body  => '',
          :time  => this_time,
          :day   => current_date_string,
          :title => strip_time(strip_tags(line))
        }

      else
        # NOTE(review): a description line that arrives before the first event
        # heading raises NoMethodError here, because no event hash exists yet.
        description = format_description(line)
        @week[current_event_id][:body] += description
      end
    end

    @week
  end

  # Checks that the date and time values captured from the document match the
  # values recalculated from each event's epoch key, and that the expected
  # number of days was seen. Problems are appended to @error_messages.
  #
  # Returns true when @error_messages is empty, false otherwise.
  def checksum_is_ok?
    @week.sort.each do |event_id, event|
      calculated = Time.at(event_id)
      was_time = calculated.strftime('%I.%M').strip

      # midnight is a special case
      if was_time == '12.00' && calculated.strftime('%p') == 'AM'
        was_time = '00.00'
      end

      was_date = format_day(calculated)

      if event[:time] != was_time
        @error_messages << "time parsing error on #{event[:title]}"
        @error_messages << " captured   => #{event[:time]} on #{event[:day]}"
        @error_messages << " calculated => #{was_time} on #{was_date}"
      end

      if event[:day] != was_date
        @error_messages << "day parsing error on #{event[:title]} => #{event[:day]}"
      end
    end

    if @day_count != @expected_day_count
      # report the figure this parser really expects (the live parsers expect 1)
      @error_messages << "the number of days is #{@day_count} when it should be #{@expected_day_count}"
    end

    @error_messages.empty?
  end

  # Returns the error messages collected so far.
  def errors
    @error_messages
  end

  private

  # Formats a Time as e.g. "Monday 3 May 2010". '%e' pads single digit days
  # with a blank, so runs of whitespace are squeezed to a single space.
  def format_day(time)
    time.strftime('%A %e %B %Y').gsub(/\s+/, ' ')
  end
end

# Parser for the Radio New Zealand National programme listing document.
# Uses the regexps and the 7 day checksum set up by ScheduleParser unchanged;
# only the cleanup and skip hooks are specialised.
class NationalScheduleParser < ScheduleParser
  # Before tidy: event headings are detected via <b>, so rewrite <strong>.
  # (<em>/</em> need no rewriting.)
  def pre_tidy_cleanup
    @html.gsub!(/<strong>/, '<b>')
    @html.gsub!(/<\/strong>/, '</b>')
  end

  # After tidy: drop noise and repair headings the event regexp would miss.
  def post_tidy_cleanup
    # remove "Email: someone@example.com" fragments
    email_regex = /Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i
    @html.gsub!(email_regex, '')

    # Fix all night programme title: wrap it in <b> so it is seen as an event heading
    @html.gsub!(/<p>(12\.\d{2}) All Night Programme/, '<p><b>\1 All Night Programme</b>')

    # drop empty paragraphs
    @html.gsub!(/<p><\/p>/, '')

    # misc - fix butted times ("7am " => "7 am ")
    @html.gsub!(/(\d)am /, '\1 am ')
    @html.gsub!(/(\d)pm /, '\1 pm ')
  end

  # Returns true for the document title line, which is not part of the schedule.
  def is_content_to_skip(line)
    case line
    when /RADIO NEW ZEALAND NATIONAL Programme Listing/
      true
    else
      false
    end
  end
end

# Parser for National schedule pages taken from the live site. These pages
# hold a single day, mark events with <h4> and write times as "h:mm".
class NationalScheduleLiveParser < ScheduleParser
  def initialize
    super
    @line_is_event_heading_regexp = /<h4>/i
    @expected_day_count = 1
    @time_delimiter = ':'
  end

  # Before tidy: remove a non-breaking space entity directly after an h4 tag.
  def pre_tidy_cleanup
    @html.gsub!(/h4>&nbsp;/, 'h4>')
  end

  # After tidy: repair headings so that the line classification works.
  def post_tidy_cleanup
    # Fix all night programme title: turn it into an <h4> event heading
    @html.gsub!(/<p>(\d{1,2}:\d{2}) All Night Programme/, '<h4>\1 All Night Programme')

    # fix up some old style schedules: there the <h3> lines are rewritten to
    # <h2>, so they are no longer caught by the <h3> skip rule in
    # is_content_to_skip
    if @html =~ /<h2>Programme Schedule<\/h2>/
      @html.gsub!(/<h3>/, '<h2>')
      @html.gsub!(/<\/h3>/, '</h2>')
    end
  end

  # Returns true for page furniture: the title, licence text and <h3> lines.
  def is_content_to_skip(line)
    case line
    when /RADIO NEW ZEALAND NATIONAL Programme Listing/,
         /If you wish to adapt our programme schedules/,
         /Programme Schedules are licensed under the Creative Commons/,
         /Please identify us as author/,
         /<h2>Programme Schedule<\/h2>/,
         /<h3>/
      true
    else
      false
    end
  end
end

# Parser for the Concert schedule document. It differs from the base parser
# in its event heading markup and in the lines it ignores.
class ConcertScheduleParser < ScheduleParser
  def initialize
    super
    # A date regexp anchored on h2 would keep text such as
    # "recorded on Saturday 14 May 2010" from being taken for a new day.
    # That variant is currently disabled:
#    @line_has_date_regexp = /h2.*((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    # Concert marks events with h3 headings
    @line_is_event_heading_regexp = /<h3>/i
  end

  # After tidy: give the bold "Disc 1" paragraph a 12.00 time prefix.
  def post_tidy_cleanup
    @html.gsub!(/<p><b>Disc 1/, '<p><b>12.00 - Disc 1')
  end

  # Returns true for lines that are left out of the parse.
  def is_content_to_skip(line)
    ignored = [/News &amp; Weather:/i, /New Zealand Music Week/i]
    ignored.any? { |pattern| pattern === line }
  end

  # Parsing itself is entirely the base class behaviour.
  def parse(html)
    super
  end
end

# Parser for Concert schedule pages taken from the live site. These pages
# hold a single day, mark events with <h4> and write times as "h:mm".
class ConcertScheduleLiveParser < ScheduleParser
  def initialize
    super
    # date regexp must have an h2 at the start to separate it from items that
    # have text such as "recorded on Saturday 14 May 2010", which would
    # otherwise make the parser think we are on a new day
    @line_has_date_regexp = /<h2.*((saturday|sunday|monday|tuesday|wednesday|thursday|friday)\s+\d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    # Concert uses a different heading style
    @line_is_event_heading_regexp = /<h4>/i
    @expected_day_count = 1
    @time_delimiter = ':'
  end

  # Rewrites two kinds of line as plain paragraphs before classification:
  # lines containing "approx " lose their tags and get "(approx)" in brackets,
  # and lines containing "Disc 1" are replaced by a fixed "12.00 - Disc 1" line.
  def pre_test_cleanup(line)
    if line =~ /approx /
      text = strip_tags(line).gsub(/approx/, '(approx)')
      "<p>#{text}</p>\n"
    elsif line =~ /Disc 1/
      "<p>12.00 - Disc 1</p>\n"
    else
      line
    end
  end

  # Returns true for lines that are left out of the parse: news lines,
  # licence text, special-day banners and <h3> lines.
  def is_content_to_skip(line)
    case line
    when /News &amp; Weather: /,
         /If you wish to adapt our programme schedules/,
         /Programme Schedules are licensed under the Creative Commons/,
         /Please identify us as author/,
         /Waitangi Day/,
         /<h3>/,
         /New Zealand Music Week/i
      true
    else
      false
    end
  end
end

# A description is a non heading line that describes the programme or its contents.
#
# Cleans such a line up for storage: removes <h2> tags, self-closing <br/>
# tags and " RR" markers, and wraps times followed by a space ("7.30 " or
# "7:30 ") as "<strong>7:30</strong> ".
#
# description      - the line to format. It is not modified; the work is done
#                    on a copy, so frozen strings are accepted too.
# process_brackets - when true (the default) bracketed text is wrapped in <em>.
#
# Returns the formatted text with leading and trailing whitespace stripped.
def format_description(description, process_brackets=true)
  formatted = description.dup
  formatted.gsub!(/<h2>/, '')
  formatted.gsub!(/<br[^>]*?\/>/, '')
  formatted.gsub!(/ RR/, '')
  formatted.gsub!(/((\d{1,2})(\.|:)(\d{2})) /, '<strong>\2:\4</strong> ')
  if process_brackets
    # inside the character class '|' is a literal pipe, not alternation
    formatted.gsub!(/\(([\w|\s|\.|\,]+)\)/, '<em>(\1)</em>')
  end

  formatted.strip
end

# Looks for a closing MS Word smart tag type element in the html.
# Returns the index of the first match, or nil when there is none.
def check_for_smarttags(html)
  closing_smarttag = %r{</o:smarttagtype>}
  html =~ closing_smarttag
end
	# This code is used to 'line parse' schedules that are created in MS Word.
	# It builds on the gist http://gist.github.com/552955

	# Once the HTML is cleaned up in core, it is passed into a class
	# based on the type of document.

	# Each class works through a schedule document line-by-line, determining
	# the context - what is the day and event

	# These events are stored and imported into the main CMS.

	# The checksum routine provides an internal check to ensure that
	# the data collected and converted into objects is accurate.

	# I consider this code to be a 'glorious' hack. It is stable most
	# of the time, parsing documents with a set structure.

	# It is finely tuned for our specific purpose, thus if the document does
	# not match the expected structure, it fails.

	# The code is provided in the hope that someone may find the techniques developed useful.

	# Earlier versions of this code (3) were written in PHP and have been in weekly use
	# since at least 2005.

	# This is the third generation of Ruby based code.

	# At the end are some classes that we used to re-parse schedules from the live
	# site so we could insert this information into our new Rails-based CMS.

	# Richard Hulse. 27 August 2010

	# Copyright (c) Radio New Zealand Limited 2010

	# MIT license
	# Permission is hereby granted, free of charge, to any person obtaining
	# a copy of this software and associated documentation files (the
	# "Software"), to deal in the Software without restriction, including
	# without limitation the rights to use, copy, modify, merge, publish,
	# distribute, sublicense, and/or sell copies of the Software, and to
	# permit persons to whom the Software is furnished to do so, subject to
	# the following conditions:

	# The above copyright notice and this permission notice shall be
	# included in all copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
	# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
	# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
	# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

	require 'html_parser_core'

	# Base class for the schedule parsers.
	#
	# A schedule document is processed line by line. Every line is classified as
	# a date heading (starts a new day), an event heading (starts a new
	# programme) or description text (appended to the body of the current event).
	# The events are gathered in a hash keyed by their start time in epoch seconds.
	#
	# Subclasses adjust the regexps set up in #initialize and override the hook
	# methods (pre_tidy_cleanup, post_tidy_cleanup, pre_test_cleanup,
	# is_content_to_skip).
	#
	# tidy_html, strip_tags, strip_time and @error_messages are not defined in
	# this class; presumably they come from ParserCore - verify against
	# html_parser_core.
	class ScheduleParser < ParserCore
	  def initialize
	    # Regexps for detecting the current state and context.
	    # Capture 1 of the date regexp is the full date text, e.g. "Monday 3 May 2010".
	    @line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday)(?:,?)\s+\d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
	    @line_is_event_heading_regexp = /<p><b>/i
	    # Regexp source (not a plain string) for the hour/minute separator.
	    @time_delimiter = '\.'

	    # tracking the count of days (used as a checksum)
	    @day_count = 0
	    @expected_day_count = 7
	    # the hash holds data for the current week, keyed by current_event_id (epoch time)
	    @week = {}
	    # the current html being worked on
	    @html = ''
	    super
	  end

	  # Hook: called on @html before tidy_html is run. Does nothing by default.
	  def pre_tidy_cleanup
	  end

	  # Hook: called on @html after tidy_html is run. Does nothing by default.
	  def post_tidy_cleanup
	  end

	  # Hook: called to change a line, if it needs it, prior to the line being
	  # checked for context. Returns the line to classify.
	  def pre_test_cleanup(line)
	    line
	  end

	  # Hook: returns a true value for line content we want to skip over,
	  # i.e. not include in the parse. Nothing is skipped by default.
	  def is_content_to_skip(line)
	    false
	  end

	  # The main parsing method.
	  #
	  # dirty_html - the raw HTML of the schedule document. It is cloned first, so
	  #              the cleanup hooks work on a copy rather than the caller's string.
	  #
	  # Returns the hash of events: epoch seconds => { :body, :time, :day, :title }.
	  def parse(dirty_html)
	    @html = dirty_html.clone
	    @week = {}
	    # Start the day checksum afresh, just like @week; otherwise parsing a
	    # second document with the same instance keeps counting from the first.
	    @day_count = 0

	    pre_tidy_cleanup
	    @html = tidy_html(@html)
	    post_tidy_cleanup

	    current_date_id = ''
	    current_event_id = ''
	    current_date_string = ''

	    index = 0      # number of events seen so far on the current day
	    last_hour = 0  # hour (24 hour clock) of the previous event

	    @html.each_line do |line|
	      next if is_content_to_skip(line)

	      line = pre_test_cleanup(line)

	      case line
	      # the date title for a page
	      when @line_has_date_regexp
	        @day_count += 1
	        index = 0     # reset the index for the start of a new day
	        last_hour = 0 # new day

	        current_date_string = $1
	        # a numerical version of the date to base the day's events on
	        current_date_id = Time.parse(current_date_string).to_i

	      when @line_is_event_heading_regexp
	        line =~ /((\d{1,2})#{@time_delimiter}(\d{2}))/
	        # nil.to_i is 0, so a heading without a recognisable time is filed at 0.00
	        this_hour = $2.to_i
	        this_minute = $3.to_i
	        this_time = '%02d.%02d' % [this_hour, this_minute] # '%I.%M' format for testing

	        # The first event of a day at 12.xx is just after midnight.
	        # NOTE(review): for a first event at exactly 12.00 :time stays "12.00"
	        # while checksum_is_ok? expects "00.00" for midnight - confirm whether
	        # such documents occur.
	        this_hour = 0 if index == 0 && this_hour == 12

	        index += 1

	        # the document uses a 12 hour clock: an hour going backwards means we passed midday
	        this_hour += 12 if last_hour > this_hour

	        last_hour = this_hour # keep track of the previous hour

	        # set the time to midnight + the time of the event
	        current_event_id = current_date_id + (this_hour * 60 * 60) + (this_minute * 60)

	        # midnight is the next day, so we have to fix the check data
	        if this_hour == 24
	          # midnight is 12 am in 12 hours clock time
	          this_time = '00.00'
	          # fake the captured string to the real (next) day
	          current_date_string = format_day(Time.at(current_event_id))
	        end

	        @week[current_event_id] = {
	          :body  => '',
	          :time  => this_time,
	          :day   => current_date_string,
	          :title => strip_time(strip_tags(line))
	        }

	      else
	        # NOTE(review): a description line that arrives before the first event
	        # heading raises NoMethodError here, because no event hash exists yet.
	        description = format_description(line)
	        @week[current_event_id][:body] += description
	      end
	    end

	    @week
	  end

	  # Checks that the date and time values captured from the document match the
	  # values recalculated from each event's epoch key, and that the expected
	  # number of days was seen. Problems are appended to @error_messages.
	  #
	  # Returns true when @error_messages is empty, false otherwise.
	  def checksum_is_ok?
	    @week.sort.each do |event_id, event|
	      calculated = Time.at(event_id)
	      was_time = calculated.strftime('%I.%M').strip

	      # midnight is a special case
	      if was_time == '12.00' && calculated.strftime('%p') == 'AM'
	        was_time = '00.00'
	      end

	      was_date = format_day(calculated)

	      if event[:time] != was_time
	        @error_messages << "time parsing error on #{event[:title]}"
	        @error_messages << " captured => #{event[:time]} on #{event[:day]}"
	        @error_messages << " calculated => #{was_time} on #{was_date}"
	      end

	      if event[:day] != was_date
	        @error_messages << "day parsing error on #{event[:title]} => #{event[:day]}"
	      end
	    end

	    if @day_count != @expected_day_count
	      # report the figure this parser really expects (the live parsers expect 1)
	      @error_messages << "the number of days is #{@day_count} when it should be #{@expected_day_count}"
	    end

	    @error_messages.empty?
	  end

	  # Returns the error messages collected so far.
	  def errors
	    @error_messages
	  end

	  private

	  # Formats a Time as e.g. "Monday 3 May 2010". '%e' pads single digit days
	  # with a blank, so runs of whitespace are squeezed to a single space.
	  def format_day(time)
	    time.strftime('%A %e %B %Y').gsub(/\s+/, ' ')
	  end
	end

	# Parser for the Radio New Zealand National programme listing document.
	# Uses the regexps and the 7 day checksum set up by ScheduleParser unchanged;
	# only the cleanup and skip hooks are specialised.
	class NationalScheduleParser < ScheduleParser
	  # Before tidy: event headings are detected via <b>, so rewrite <strong>.
	  # (<em>/</em> need no rewriting.)
	  def pre_tidy_cleanup
	    @html.gsub!(/<strong>/, '<b>')
	    @html.gsub!(/<\/strong>/, '</b>')
	  end

	  # After tidy: drop noise and repair headings the event regexp would miss.
	  def post_tidy_cleanup
	    # remove "Email: someone@example.com" fragments
	    email_regex = /Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i
	    @html.gsub!(email_regex, '')

	    # Fix all night programme title: wrap it in <b> so it is seen as an event heading
	    @html.gsub!(/<p>(12\.\d{2}) All Night Programme/, '<p><b>\1 All Night Programme</b>')

	    # drop empty paragraphs
	    @html.gsub!(/<p><\/p>/, '')

	    # misc - fix butted times ("7am " => "7 am ")
	    @html.gsub!(/(\d)am /, '\1 am ')
	    @html.gsub!(/(\d)pm /, '\1 pm ')
	  end

	  # Returns true for the document title line, which is not part of the schedule.
	  def is_content_to_skip(line)
	    case line
	    when /RADIO NEW ZEALAND NATIONAL Programme Listing/
	      true
	    else
	      false
	    end
	  end
	end

	# Parser for National schedule pages taken from the live site. These pages
	# hold a single day, mark events with <h4> and write times as "h:mm".
	class NationalScheduleLiveParser < ScheduleParser
	  def initialize
	    super
	    @line_is_event_heading_regexp = /<h4>/i
	    @expected_day_count = 1
	    @time_delimiter = ':'
	  end

	  # Before tidy: remove a non-breaking space entity directly after an h4 tag.
	  # The pattern has to name the entity itself; a plain space would not match it.
	  def pre_tidy_cleanup
	    @html.gsub!(/h4>&nbsp;/, 'h4>')
	  end

	  # After tidy: repair headings so that the line classification works.
	  def post_tidy_cleanup
	    # Fix all night programme title: turn it into an <h4> event heading
	    @html.gsub!(/<p>(\d{1,2}:\d{2}) All Night Programme/, '<h4>\1 All Night Programme')

	    # fix up some old style schedules: there the <h3> lines are rewritten to
	    # <h2>, so they are no longer caught by the <h3> skip rule in
	    # is_content_to_skip
	    if @html =~ /<h2>Programme Schedule<\/h2>/
	      @html.gsub!(/<h3>/, '<h2>')
	      @html.gsub!(/<\/h3>/, '</h2>')
	    end
	  end

	  # Returns true for page furniture: the title, licence text and <h3> lines.
	  def is_content_to_skip(line)
	    case line
	    when /RADIO NEW ZEALAND NATIONAL Programme Listing/,
	         /If you wish to adapt our programme schedules/,
	         /Programme Schedules are licensed under the Creative Commons/,
	         /Please identify us as author/,
	         /<h2>Programme Schedule<\/h2>/,
	         /<h3>/
	      true
	    else
	      false
	    end
	  end
	end

	# Parser for the Concert schedule document. It differs from the base parser
	# in its event heading markup and in the lines it ignores.
	class ConcertScheduleParser < ScheduleParser
	  def initialize
	    super
	    # A date regexp anchored on h2 would keep text such as
	    # "recorded on Saturday 14 May 2010" from being taken for a new day.
	    # That variant is currently disabled:
	    # @line_has_date_regexp = /h2.*((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
	    # Concert uses a different heading style
	    @line_is_event_heading_regexp = /<h3>/i
	  end

	  # After tidy: give the bold "Disc 1" paragraph a 12.00 time prefix.
	  def post_tidy_cleanup
	    @html.gsub!(/<p><b>Disc 1/, '<p><b>12.00 - Disc 1')
	  end

	  # Returns true for lines that are left out of the parse.
	  # The ampersand is matched in its HTML-escaped form (&amp;).
	  def is_content_to_skip(line)
	    case line
	    when /News &amp; Weather:/i,
	         /New Zealand Music Week/i
	      true
	    else
	      false
	    end
	  end

	  # Parsing itself is entirely the base class behaviour.
	  def parse(html)
	    super(html)
	  end
	end

	# Parser for Concert schedule pages taken from the live site. These pages
	# hold a single day, mark events with <h4> and write times as "h:mm".
	class ConcertScheduleLiveParser < ScheduleParser
	  def initialize
	    super
	    # date regexp must have an h2 at the start to separate it from items that
	    # have text such as "recorded on Saturday 14 May 2010", which would
	    # otherwise make the parser think we are on a new day
	    @line_has_date_regexp = /<h2.*((saturday|sunday|monday|tuesday|wednesday|thursday|friday)\s+\d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
	    # Concert uses a different heading style
	    @line_is_event_heading_regexp = /<h4>/i
	    @expected_day_count = 1
	    @time_delimiter = ':'
	  end

	  # Rewrites two kinds of line as plain paragraphs before classification:
	  # lines containing "approx " lose their tags and get "(approx)" in brackets,
	  # and lines containing "Disc 1" are replaced by a fixed "12.00 - Disc 1" line.
	  def pre_test_cleanup(line)
	    if line =~ /approx /
	      text = strip_tags(line).gsub(/approx/, '(approx)')
	      "<p>#{text}</p>\n"
	    elsif line =~ /Disc 1/
	      "<p>12.00 - Disc 1</p>\n"
	    else
	      line
	    end
	  end

	  # Returns true for lines that are left out of the parse: news lines,
	  # licence text, special-day banners and <h3> lines.
	  # The ampersand is matched in its HTML-escaped form (&amp;).
	  def is_content_to_skip(line)
	    case line
	    when /News &amp; Weather: /,
	         /If you wish to adapt our programme schedules/,
	         /Programme Schedules are licensed under the Creative Commons/,
	         /Please identify us as author/,
	         /Waitangi Day/,
	         /<h3>/,
	         /New Zealand Music Week/i
	      true
	    else
	      false
	    end
	  end
	end

	# A description is a non heading line that describes the programme or its contents.
	#
	# Cleans such a line up for storage: removes <h2> tags, self-closing <br/>
	# tags and " RR" markers, and wraps times followed by a space ("7.30 " or
	# "7:30 ") as "<strong>7:30</strong> ".
	#
	# description      - the line to format. It is not modified; the work is done
	#                    on a copy, so frozen strings are accepted too.
	# process_brackets - when true (the default) bracketed text is wrapped in <em>.
	#
	# Returns the formatted text with leading and trailing whitespace stripped.
	def format_description(description, process_brackets=true)
	  formatted = description.dup
	  formatted.gsub!(/<h2>/, '')
	  formatted.gsub!(/<br[^>]*?\/>/, '')
	  formatted.gsub!(/ RR/, '')
	  # the separator group is an alternation: either '.' or ':'
	  formatted.gsub!(/((\d{1,2})(\.|:)(\d{2})) /, '<strong>\2:\4</strong> ')
	  if process_brackets
	    # inside the character class '|' is a literal pipe, not alternation
	    formatted.gsub!(/\(([\w|\s|\.|\,]+)\)/, '<em>(\1)</em>')
	  end

	  formatted.strip
	end

	# Looks for a closing MS Word smart tag type element in the html.
	# Returns the index of the first match, or nil when there is none.
	def check_for_smarttags(html)
	  closing_smarttag = %r{</o:smarttagtype>}
	  html =~ closing_smarttag
	end