Skip to content

Instantly share code, notes, and snippets.

@rhulse
Created May 13, 2011 21:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rhulse/971362 to your computer and use it in GitHub Desktop.
Save rhulse/971362 to your computer and use it in GitHub Desktop.
Version 3 of the parser
# This code is for illustrative purposes only and should be read in conjunction
# with this blog post:
# http://richardhulse.blogspot.com/2011/05/rebuilding-radio-nz-part-6-schedules.html
# This code is released under an MIT license (the same as Rails).
# Turns the Radio New Zealand National weekly schedule document into one
# HTML body per day.
class NationalScheduleParser < Parser
  # Parses a schedule document.
  #
  # html - String of schedule HTML; it is passed through clean_word_html
  #        before being read line by line.
  #
  # Returns a Hash of preview/style options. Its :data entry is a Hash keyed
  # by each day's Time#to_i value; every value holds :publish_now,
  # :matrix_parent and the :body HTML accumulated for that day (with the
  # licence block appended by add_cc_license).
  def self.parse!(html)
    line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    line_is_bold_regexp = /<p><b>/i
    links = Programme.find(:all)

    html = clean_word_html(html)
    # Make this programme line bold so it is picked up as a heading below.
    html.gsub!(/<p>12\.04 All Night Programme/, '<p><b>12.04 All Night Programme</b>')

    # the hash holds data for the current week, keyed by current_date_id
    week = {}
    # nil until the first date line has been seen
    current_date_id = nil

    html.each_line do |line|
      case line
      # some things to omit
      when /RADIO NEW ZEALAND NATIONAL Programme Listing/
        next
      # the date title for a page
      when line_has_date_regexp
        date = Time.parse(Regexp.last_match(1))
        # a numerical version of the date for sorting the hash
        current_date_id = date.to_i
        week[current_date_id] = {
          :publish_now => 1,
          :matrix_parent => 35083,
          :body => make_day_title(date)
        }
      when line_is_bold_regexp
        # Content ahead of the first date has no day to belong to; skip it
        # rather than failing on a missing hash entry.
        next unless current_date_id
        week[current_date_id][:body] += clean_and_format_heading(line, links)
      else
        next unless current_date_id
        week[current_date_id][:body] += clean_description(line)
      end
    end

    # there are also some preview and license options passed back
    {
      :preview_class => 'national',
      :preview_id => 'timetable',
      :upload => true,
      :style_id => 'timetable',
      :style_presentation => 'div',
      :data => add_cc_license(week, {:type => 'cc-nd'})
    }
  end
end
# Turns the Radio New Zealand Concert weekly schedule document into one
# HTML body per day.
class ConcertScheduleParser < Parser
  # Parses a schedule document.
  #
  # html - String of schedule HTML; it is passed through clean_word_html
  #        before being read line by line.
  #
  # Returns a Hash of preview/style options. Its :data entry is a Hash keyed
  # by each day's Time#to_i value; every value holds :publish_now,
  # :matrix_parent and the :body HTML accumulated for that day (with the
  # licence block appended by add_cc_license).
  def self.parse!(html)
    line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    # headings in this document are marked with <h3>
    line_is_bold_regexp = /<h3>/i
    links = Programme.find(:all)

    html = clean_word_html(html)
    # Prefix the "Disc 1" line with a 12.00 time.
    html.gsub!(/<p><b>Disc 1/, '<p><b>12.00 - Disc 1')

    # the hash holds data for the current week, keyed by current_date_id
    week = {}
    # nil until the first date line has been seen
    current_date_id = nil

    html.each_line do |line|
      case line
      # the date title for a page
      when line_has_date_regexp
        date = Time.parse(Regexp.last_match(1))
        # a numerical version of the date for sorting the hash
        current_date_id = date.to_i
        week[current_date_id] = {
          :publish_now => 1,
          :matrix_parent => 35103,
          :body => make_day_title(date)
        }
      when line_is_bold_regexp
        # Content ahead of the first date has no day to belong to; skip it
        # rather than failing on a missing hash entry.
        next unless current_date_id
        week[current_date_id][:body] += clean_and_format_heading(line, links)
      else
        next unless current_date_id
        week[current_date_id][:body] += clean_description(line)
      end
    end

    # there are also some preview and license options passed back
    {
      :preview_class => 'concert',
      :preview_id => 'timetable',
      :upload => true,
      :style_id => 'timetable',
      :style_presentation => 'div',
      :data => add_cc_license(week, {:type => 'cc-nd'})
    }
  end
end
require 'rubygems'
require 'sanitize'
# Shared HTML clean-up and formatting helpers for the schedule parsers.
# Every method is a class method; the class holds no state.
class Parser
  # English month and day names mapped to their te reo Maori equivalents
  # (macrons are written as HTML numeric entities).
  MAORI_DATE_NAMES = {
    'January' => 'Kohi-t&#257;tea',
    'February' => 'Hui-tanguru',
    'March' => 'Pout&#363;-te-rangi',
    'April' => 'Paengawh&#257;-wh&#257;',
    'May' => 'Haratua',
    'June' => 'Pipiri',
    'July' => 'H&#333;ngongoi',
    'August' => 'Here-turi-k&#333;k&#257;',
    'September' => 'Mahuru',
    'October' => 'Whiringa-&#257;-nuku',
    'November' => 'Whiringa-&#257;-rangi',
    'December' => 'Hakihea',
    'Monday' => 'R&#257;hina',
    'Tuesday' => 'R&#257;t&#363;',
    'Wednesday' => 'R&#257;apa',
    'Thursday' => 'R&#257;pare',
    'Friday' => 'R&#257;mere',
    'Saturday' => 'R&#257;horoi',
    'Sunday' => 'R&#257;tapu'
  }.freeze

  # Matches any one of the English names above.
  MAORI_DATE_REGEXP = Regexp.union(*MAORI_DATE_NAMES.keys)

  # Runs the document through tidy and Sanitize, then normalises it so that
  # each block-level element ends up on its own line.
  #
  # dirty_html - String of raw HTML. It is not modified.
  # elements   - Array of tag names Sanitize should keep.
  # attributes - Hash of attributes Sanitize should keep.
  #
  # Returns a new String with a newline after every closing
  # p/h1-h6/dt/dd/dl tag and after every opening dl tag.
  def self.clean_word_html(dirty_html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes={})
    email_regex = /<p>Email:\s+((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/i

    # Flatten line breaks on a copy. Only \n and \r are replaced: a "|"
    # inside a character class is a literal pipe, not an alternation.
    html = tidy(dirty_html.gsub(/[\n\r]/, ' '))
    # keep only the things we want.
    html = Sanitize.clean(html, :elements => elements, :attributes => attributes)

    # butt up any tags
    html.gsub!(/&nbsp;/, ' ')
    html.gsub!(/>\s+</, '><')
    # remove email address lines
    html.gsub!(email_regex, '<p>')

    # post sanitize cleanup of empty blocks
    # the order of removal is important - this is the way word stacks these elements
    html.gsub!(/<i><\/i>/, '')
    html.gsub!(/<b><\/b>/, '')
    html.gsub!(/<\/b><b>/, '')
    html.gsub!(/<p><\/p>/, '')
    html.gsub!(/<p><b><\/b><\/p>/, '')

    # misc - fix butted times
    html.gsub!(/(\d)am /, '\1 am ')
    html.gsub!(/(\d)pm /, '\1 pm ')
    # misc - remove multiple space that may cause doc specific regexs to fail (in dates for example)
    html.gsub!(/\s+/, ' ')

    # add new lines at the end of lines
    html.gsub!(/<\/(p|h\d|dt|dd|dl)>/, "</\\1>\n")
    html.gsub!(/<dl>/, "<dl>\n")
    html
  end

  # Pipes HTML through the external `tidy` command; tidy's own diagnostics
  # go to log/tidy_errors.log under RAILS_ROOT.
  #
  # dirty_html - String of HTML to clean.
  #
  # Returns tidy's output, or dirty_html unchanged when tidy could not be
  # run or produced nothing.
  def self.tidy(dirty_html)
    error_file = File.join(RAILS_ROOT, '/log/tidy_errors.log')
    tidy_options = '--word-2000 1 --indent 0 --bare 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -utf8'
    cleaned_html = nil

    begin
      # The block form closes the pipe even when an exception escapes.
      IO.popen("tidy -f #{error_file} #{tidy_options}", 'w+') do |pipe|
        pipe.write(dirty_html)
        pipe.close_write
        cleaned_html = pipe.read
      end
    rescue Errno::EPIPE, Errno::ENOENT => e
      # A missing tidy binary is treated like a broken pipe: report it and
      # fall back to the untouched input.
      $stderr.print "Running 'tidy' failed: #{e.message}"
    end

    return cleaned_html if cleaned_html and cleaned_html != ""
    dirty_html
  end

  # Removes markup via Sanitize, keeping only the given elements/attributes,
  # and trims surrounding whitespace.
  #
  # Returns the stripped String.
  def self.strip_tags(html, elements=[], attributes={})
    Sanitize.clean(html, :elements => elements, :attributes => attributes).strip
  end

  # A heading is a bold or word heading style normally applied to a
  # main programme name with a time at the start of the line.
  #
  # heading    - String line of HTML (nil is tolerated).
  # programmes - Enumerable of objects responding to web_display_name and
  #              web_path; each occurrence of a name becomes a link.
  # h_level    - heading level used for the wrapping <hN> tag.
  #
  # Returns the heading wrapped in <hN>...</hN> plus a newline, or the
  # String "no heading" when heading is nil/false.
  def self.clean_and_format_heading(heading, programmes, h_level='4' )
    return "no heading" unless heading

    heading = strip_tags(heading)
    heading.gsub!(/ RR/, '')
    # NOTE(review): the "|" characters inside this class are literal pipes;
    # kept as-is so the set of wrapped texts does not change.
    heading.gsub!(/\(([\w|\s|\.|\,]+)\)/, '<strong>(\1)</strong>') # things in brackets
    heading.gsub!(/((\d{2})\.(\d{2})) /, '<em>\2:\3</em> ') # times with a space after (not in a list)
    heading.gsub!(/((\d{1})\.(\d{2})) /, '<em>&nbsp;\2:\3</em> ') # times with a space after (not in a list)
    heading.gsub!(/((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/, '') # email
    heading.gsub!(/\n/, '')
    heading.strip!

    programmes.each do |programme|
      name = programme.web_display_name
      path = programme.web_path
      next unless name and path
      # An empty name would match at every position in the heading.
      next if name.to_s.empty?

      # Escape the name so characters such as "?", "+" or "(" are matched
      # literally rather than being read as regexp syntax.
      heading.gsub!(/#{Regexp.escape(name)}/i) { |match| "<a href=\"#{path}\">#{match}</a>" }
    end

    "<h#{h_level}>" + heading + "</h#{h_level}>\n"
  end

  # A description is a non heading line that describes the programme or its contents.
  #
  # description      - String line of HTML. It is not modified.
  # process_brackets - when true, bracketed text is wrapped in <em>.
  #
  # Returns a new String with b/i tags converted to strong/em, " RR"
  # markers and "Email: ..." addresses removed, and times such as
  # "7.30 " rewritten as "<strong>7:30</strong> ".
  def self.clean_description(description, process_brackets=true)
    # The first substitution is non-destructive so the caller's string
    # (possibly frozen) is left alone; the rest mutate the copy.
    text = description.gsub(/<b>/, '<strong>')
    text.gsub!(/<i>/, '<em>')
    text.gsub!(/<\/b>/, '</strong>')
    text.gsub!(/<\/i>/, '</em>')
    text.gsub!(/ RR/, '')
    text.gsub!(/((\d{1,2})\.(\d{2})) /, '<strong>\2:\3</strong> ')
    text.gsub!(/Email:\s+((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/i, '')
    text.gsub!(/\(([\w|\s|\.|\,]+)\)/, '<em>(\1)</em>') if process_brackets
    text
  end

  # Builds the bilingual <h2> day title.
  #
  # date - object responding to strftime (e.g. a Time).
  #
  # Returns the heading HTML with the English date in a span of class
  # "eng" and the translated date in a span of class "reo".
  def self.make_day_title(date)
    text_date = date.strftime("%A %e %B %Y")
    tereo_date = convert_date_to_maori(text_date)
    '<h2 class="bi"><span class="eng">' + text_date + '</span> ' +
      '<span class="reo">' + tereo_date + '</span></h2>'
  end

  # Replaces English day and month names with the entries from
  # MAORI_DATE_NAMES.
  #
  # date_string - String such as "Saturday 14 May 2011". It is not modified.
  #
  # Returns a new String.
  def self.convert_date_to_maori(date_string)
    date_string.gsub(MAORI_DATE_REGEXP) { |english| MAORI_DATE_NAMES[english] }
  end

  # Appends the licence HTML to the :body of every entry in data.
  #
  # data - Hash whose values are Hashes with a String :body.
  # opts - forwarded to generate_cc_license_html.
  #
  # Returns data (the same Hash, updated in place).
  def self.add_cc_license(data, opts=nil)
    cc = generate_cc_license_html(opts)
    data.each_value { |entry| entry[:body] += cc }
    data
  end

  # Returns the fixed Creative Commons licence block as an HTML String.
  # The type argument is accepted but not used by this implementation.
  def self.generate_cc_license_html(type)
    [
      '<div class="license">',
      '<p><a href="http://creativecommons.org/licenses/by-nd/3.0/nz/" rel="license"><img src="http://i.creativecommons.org/l/by-nd/3.0/nz/88x31.png" alt="Creative Commons License"/></a><br/>Radio New Zealand\'s Programme Schedules are licensed under the <a href="http://creativecommons.org/licenses/by-nd/3.0/nz/" rel="license">Creative Commons Attribution-No Derivative Works 3.0 New Zealand License</a>.</p>',
      '<p>Please identify us as author of the programme schedules by adding a credit to "Radio New Zealand Limited" and providing a link to our website, www.radionz.co.nz.</p>',
      '<p>If you wish to adapt our programme schedules, please see our <a href="/legal/programme_schedules_tou">Terms of Use for Adapting Programme Schedules</a></p>',
      '</div>'
    ].join("\n")
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment