Create a gist now

Instantly share code, notes, and snippets.

# This code is used to 'line parse' schedules that are created in MS Word.
# It builds on the gist http://gist.github.com/552955
# Once the HTML is cleaned up in core, it is passed into a class
# based on the type of document.
# Each class works through a schedule document line-by-line, determining
# the context - what is the day and event
# These events are stored and imported into the main CMS.
# The checksum routine provides an internal check to ensure that
# the data collected and converted into objects is accurate.
# I consider this code to be a 'glorious' hack. It is stable most
# of the time, parsing documents with a set structure.
# It is finely tuned for our specific purpose, thus if the document does
# not match the expected structure, it fails.
# The code is provided in the hope that someone may find the techniques developed useful.
# Earlier versions of this code (3) were written in PHP and have been in weekly use
# since at least 2005.
# This is the third generation of Ruby based code.
# At the end are some classes that we used to re-parse schedules from the live
# site so we could insert this information into our new Rails-based CMS.
# Richard Hulse. 27 August 2010
# Copyright (c) Radio New Zealand Limited 2010
# MIT license
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
require 'html_parser_core'
# Line-by-line parser for a schedule document.
#
# The document is walked one line at a time. A line matching
# @line_has_date_regexp starts a new day, a line matching
# @line_is_event_heading_regexp starts a new event, and any other line is
# appended to the body of the current event. Events are collected in @week,
# keyed by an epoch-seconds Integer (midnight of the day plus the event time).
#
# Derived classes tune the regexps, the time delimiter and the expected day
# count, and override the cleanup / skip hooks below.
#
# NOTE(review): tidy_html, strip_tags, strip_time and @error_messages are not
# defined in this class; presumably they come from ParserCore - confirm.
class ScheduleParser < ParserCore
  def initialize
    # REGEXS for detecting the current state and context
    @line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday)(?:,?)\s+\d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    @line_is_event_heading_regexp = /<p><b>/i
    # interpolated into a regexp, hence the escaped dot
    @time_delimiter = '\.'
    # tracking the count of days (used as a checksum - should be 7)
    @day_count = 0
    @expected_day_count = 7
    # the hash holds data for the current week, keyed by current_event_id (epoch time)
    @week = {}
    # the current html being worked on
    @html = ''
    super
  end

  # these are called in derived classes to do extra specific things
  def pre_tidy_cleanup
  end

  def post_tidy_cleanup
  end

  # this is called to change lines if they need it
  # prior to being checked for context
  def pre_test_cleanup(line)
    line
  end

  # line content which we may want to skip over
  # i.e. not include in the parse.
  # The base class skips nothing.
  def is_content_to_skip(line)
    false
  end

  # The main parsing method.
  #
  # dirty_html - the raw HTML String of the schedule (it is cloned, not modified).
  #
  # Returns the @week Hash: { event_id => { :body, :time, :day, :title } }.
  # Per-parse state (@week and @day_count) is reset on every call so that one
  # parser instance can be reused without corrupting the day-count checksum.
  #
  # An event heading that appears before any date line raises a TypeError,
  # because the event id cannot be computed without a day to base it on.
  def parse(dirty_html)
    @html = dirty_html.clone
    @week = {}
    @day_count = 0
    pre_tidy_cleanup
    @html = tidy_html(@html)
    post_tidy_cleanup

    current_date_id = ''
    current_event_id = ''
    current_date_string = ''
    index = 0
    last_hour = 0

    @html.each_line do |line|
      next if is_content_to_skip(line)

      line = pre_test_cleanup(line)
      case line
      # the date title for a page
      when @line_has_date_regexp
        @day_count += 1
        index = 0 # reset the index for the start of a new day
        last_hour = 0 # new day
        current_date_string = Regexp.last_match(1)
        # a numerical version of the date to base the day's events on
        current_date_id = Time.parse(current_date_string).to_i
      when @line_is_event_heading_regexp
        time_match = line.match(/((\d{1,2})#{@time_delimiter}(\d{2}))/)
        # a heading without a recognisable time is treated as 0:00
        this_hour = time_match ? time_match[2].to_i : 0
        this_minute = time_match ? time_match[3].to_i : 0
        this_time = "%02d.%02d" % [this_hour, this_minute] # '%I.%M' format for testing

        if index == 0 && this_hour == 12 # then it is midnight
          this_hour = 0
        end
        index += 1
        if last_hour > this_hour # then we passed midday
          this_hour += 12
        end
        last_hour = this_hour # keep track of the previous hour

        # set the time to midnight + the time of the event
        current_event_id = current_date_id + (this_hour * 60 * 60) + (this_minute * 60)

        # midnight is the next day, so we have to fix the check data
        if this_hour == 24
          # fake the captured string to the real (next) day
          current_date_string = Time.at(current_event_id).strftime('%A %e %B %Y').gsub(/\s+/, ' ')
        end
        # Exactly midnight is '00.00' in the check data, whether it is the
        # first event of a day or the last one rolling over to the next day.
        # Other times in the midnight hour keep their 12.xx label, which is
        # what checksum_is_ok? recomputes for them.
        if (this_hour == 0 || this_hour == 24) && this_minute == 0
          this_time = '00.00'
        end

        @week[current_event_id] = {
          :body => '',
          :time => this_time,
          :day => current_date_string,
          :title => strip_time(strip_tags(line))
        }
      else
        # A description line before the first event heading has no event to
        # attach to; skip it instead of failing on a nil entry.
        next unless @week.key?(current_event_id)

        @week[current_event_id][:body] += format_description(line)
      end
    end
    @week
  end

  # Checks that the date and time values extracted from the document match
  # the values recomputed from each event's epoch key, and that the expected
  # number of days was seen. Problems are appended to @error_messages.
  #
  # Returns true when @error_messages is empty, false otherwise.
  def checksum_is_ok?
    @week.sort.each do |event_id, event|
      event_time = Time.at(event_id)
      was_time = event_time.strftime('%I.%M').strip
      # midnight is a special case
      if was_time == '12.00' && event_time.strftime('%p') == 'AM'
        was_time = '00.00'
      end
      # %e pads single-digit days with a space, so collapse whitespace
      was_date = event_time.strftime('%A %e %B %Y').gsub(/\s+/, ' ')

      if event[:time] != was_time
        @error_messages << "time parsing error on #{event[:title]}"
        @error_messages << " captured => #{event[:time]} on #{event[:day]}"
        @error_messages << " calculated => #{was_time} on #{was_date}"
      end
      if event[:day] != was_date
        @error_messages << "day parsing error on #{event[:title]} => #{event[:day]}"
      end
    end

    if @day_count != @expected_day_count
      @error_messages << "the number of days is #{@day_count} when it should be #{@expected_day_count}"
    end
    @error_messages.empty?
  end

  # The messages collected so far in @error_messages.
  def errors
    @error_messages
  end
end
# Schedule parser that uses the default ScheduleParser settings and adds
# clean-up rules for the "RADIO NEW ZEALAND NATIONAL Programme Listing"
# document.
class NationalScheduleParser < ScheduleParser
  def initialize
    super
  end

  # Runs before the HTML is tidied: rewrites <strong> tags as <b>.
  def pre_tidy_cleanup
    @html.gsub!(/<strong>/, '<b>')
    @html.gsub!(/<\/strong>/, '</b>')
    # NOTE(review): the next two substitutions replace <em> with itself and
    # so change nothing; possibly another tag was intended - confirm.
    @html.gsub!(/<em>/, '<em>')
    @html.gsub!(/<\/em>/, '</em>')
  end

  # Runs after the HTML is tidied.
  def post_tidy_cleanup
    email_regex = /Email:\s+((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/i
    # remove email address lines
    @html.gsub!(email_regex, '')
    # Fix all night programme title: wrap it in <b> after the opening <p>
    @html.gsub!(/<p>(12\.\d{2}) All Night Programme/, '<p><b>\1 All Night Programme</b>')
    # drop empty paragraphs
    @html.gsub!(/<p><\/p>/, '')
    # misc - fix butted times
    @html.gsub!(/(\d)am /, '\1 am ')
    @html.gsub!(/(\d)pm /, '\1 pm ')
  end

  # Returns true for lines that must not take part in the parse
  # (the document title line), false otherwise.
  def is_content_to_skip(line)
    case line
    when /RADIO NEW ZEALAND NATIONAL Programme Listing/
      true
    else
      false
    end
  end

  # Delegates unchanged to ScheduleParser#parse.
  def parse(html)
    super(html)
  end
end
# Schedule parser variant that expects a single day per document, <h4>
# event headings and ':' as the time delimiter.
class NationalScheduleLiveParser < ScheduleParser
  def initialize
    super
    @line_is_event_heading_regexp = /<h4>/i
    @expected_day_count = 1
    @time_delimiter = ':'
  end

  # Runs before the HTML is tidied: removes a non-breaking space entity
  # directly after an h4 tag.
  def pre_tidy_cleanup
    @html.gsub!(/h4>&nbsp;/, 'h4>')
  end

  # Runs after the HTML is tidied.
  def post_tidy_cleanup
    # Fix all night programme title: turn its opening <p> into <h4>
    @html.gsub!(/<p>(\d{1,2}:\d{2}) All Night Programme/, '<h4>\1 All Night Programme')
    # fix up some old style schedules - see skip below
    if @html =~ /<h2>Programme Schedule<\/h2>/
      @html.gsub!(/<h3>/, '<h2>')
      @html.gsub!(/<\/h3>/, '</h2>')
    end
  end

  # Returns true for lines that must not take part in the parse
  # (title, licence notices and <h3> lines), false otherwise.
  def is_content_to_skip(line)
    case line
    when /RADIO NEW ZEALAND NATIONAL Programme Listing/
      true
    when /If you wish to adapt our programme schedules/
      true
    when /Programme Schedules are licensed under the Creative Commons/
      true
    when /Please identify us as author/
      true
    when /<h2>Programme Schedule<\/h2>/
      true
    when /<h3>/
      true
    else
      false
    end
  end

  # Delegates unchanged to ScheduleParser#parse.
  def parse(html)
    super(html)
  end
end
# Schedule parser variant whose event headings are <h3> lines; everything
# else is inherited from ScheduleParser.
class ConcertScheduleParser < ScheduleParser
  def initialize
    super
    # date regexp must have an h2 at the start to separate items that have
    # text such as "recorded on Saturday 14 May 2010"
    # this makes the parser think we are on a new day
    # @line_has_date_regexp = /h2.*((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    # Concert uses a different heading style
    @line_is_event_heading_regexp = /<h3>/i
  end

  # Runs after the HTML is tidied: prefixes bold "Disc 1" paragraphs
  # with a 12.00 time.
  def post_tidy_cleanup
    @html.gsub!(/<p><b>Disc 1/, '<p><b>12.00 - Disc 1')
  end

  # True for lines that are left out of the parse, false for all others.
  def is_content_to_skip(line)
    case line
    when /News &amp; Weather:/i, /New Zealand Music Week/i then true
    else false
    end
  end

  # Hands the html straight to ScheduleParser#parse.
  def parse(html)
    super(html)
  end
end
# Schedule parser variant that expects a single day per document, a date
# line inside an <h2>, <h4> event headings and ':' as the time delimiter.
class ConcertScheduleLiveParser < ScheduleParser
  def initialize
    super
    # date regexp must have an h2 at the start to separate items that have
    # text such as "recorded on Saturday 14 May 2010"
    # this makes the parser think we are on a new day
    @line_has_date_regexp = /<h2.*((saturday|sunday|monday|tuesday|wednesday|thursday|friday)\s+\d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    # Concert uses a different heading style
    @line_is_event_heading_regexp = /<h4>/i
    @expected_day_count = 1
    @time_delimiter = ':'
  end

  # Rewrites a line before it is tested for context.
  # Lines containing "approx " are stripped of tags, get "approx" put in
  # brackets and are re-wrapped in a paragraph; lines containing "Disc 1"
  # are replaced by a fixed paragraph. All other lines pass through.
  def pre_test_cleanup(line)
    if line =~ /approx /
      line = strip_tags(line)
      line = line.gsub(/approx/, '(approx)')
      line = "<p>#{line}</p>\n"
    elsif line =~ /Disc 1/
      line = "<p>12.00 - Disc 1</p>\n"
    end
    line
  end

  # Returns true for lines that must not take part in the parse,
  # false otherwise.
  def is_content_to_skip(line)
    case line
    when /News &amp; Weather: /
      true
    when /If you wish to adapt our programme schedules/
      true
    when /Programme Schedules are licensed under the Creative Commons/
      true
    when /Please identify us as author/
      true
    when /Waitangi Day/
      true
    when /<h3>/
      true
    when /New Zealand Music Week/i
      true
    else
      false
    end
  end

  # Delegates unchanged to ScheduleParser#parse.
  def parse(html)
    super(html)
  end
end
# A description is a non heading line that describes the programme or its contents.
#
# description      - the line to format. It is not modified, so frozen
#                    strings are accepted.
# process_brackets - when true (the default), bracketed text such as
#                    "(repeat)" is wrapped in <em> tags.
#
# Removes <h2> opening tags, self-closing <br/> tags and " RR" markers, and
# rewrites times such as "7.30 " or "7:30 " as "<strong>7:30</strong> ".
#
# Returns a new String with leading and trailing whitespace stripped.
def format_description(description, process_brackets=true)
  # the non-bang gsub gives us a private copy to work on
  formatted = description.gsub(/<h2>/, '')
  formatted.gsub!(/<br[^>]*?\/>/, '')
  formatted.gsub!(/ RR/, '')
  formatted.gsub!(/((\d{1,2})(\.|:)(\d{2})) /, '<strong>\2:\4</strong> ')
  if process_brackets
    formatted.gsub!(/\(([\w|\s|\.|\,]+)\)/, '<em>(\1)</em>')
  end
  formatted.strip
end
# Looks for a closing </o:smarttagtype> tag in the given html.
# Returns the index of the first occurrence, or nil when there is none.
def check_for_smarttags(html)
  smarttag_close = %r{</o:smarttagtype>}
  html =~ smarttag_close
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment