Version 3 of the parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code is for illustrative purposes only and should be read in conjunction
# with this blog post:
# http://richardhulse.blogspot.com/2011/05/rebuilding-radio-nz-part-6-schedules.html
# This code is released under an MIT license (the same as Rails).
# Parser for the "RADIO NEW ZEALAND NATIONAL" weekly programme listing.
class NationalScheduleParser < Parser
  # Splits a schedule document into one entry per broadcast day.
  #
  # html - String of HTML for the whole listing. It is passed through
  #        clean_word_html first, and the loop below relies on the result
  #        having one block element per line.
  #
  # Returns a Hash of preview/style options plus :data, a Hash keyed by each
  # day's Time#to_i value whose entries hold :publish_now, :matrix_parent and
  # the accumulated :body markup (with the licence block appended).
  def self.parse!(html)
    line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    line_is_bold_regexp = /<p><b>/i
    # NOTE(review): `find(:all)` is the older ActiveRecord finder form --
    # confirm it is still available in the Rails version this runs under.
    links = Programme.find(:all)
    html = clean_word_html(html)
    # This programme line is not bold in the source document; wrap it so it
    # is handled as a heading like every other programme.
    html.gsub!(/<p>12\.04 All Night Programme/, '<p><b>12.04 All Night Programme</b>')

    # the hash holds data for the current week, keyed by the day's Time#to_i
    week = {}
    # there are also some preview and license options passed back
    data = {
      :preview_class => 'national',
      :preview_id => 'timetable',
      :upload => true,
      :style_id => 'timetable',
      :style_presentation => 'div',
    }

    # Entry for the day currently being filled; nil until a date line is seen.
    current_day = nil
    html.each_line do |line|
      case line
      # some things to omit
      when /RADIO NEW ZEALAND NATIONAL Programme Listing/
        next
      # the date title for a page
      when line_has_date_regexp
        date = Time.parse($1)
        # a numerical version of the date is the key, for sorting the hash;
        # a repeated date heading starts that day's entry afresh
        current_day = week[date.to_i] = {
          :publish_now => 1,
          :matrix_parent => 35083,
          :body => make_day_title(date),
        }
      when line_is_bold_regexp
        # Lines ahead of the first date heading have no day to belong to, so
        # they are skipped rather than raising NoMethodError on nil.
        next unless current_day
        current_day[:body] += clean_and_format_heading(line, links)
      else
        next unless current_day
        current_day[:body] += clean_description(line)
      end
    end

    week = add_cc_license(week, {:type => 'cc-nd'})
    data[:data] = week
    data
  end
end
# Parser for the Concert weekly programme listing.
class ConcertScheduleParser < Parser
  # Splits a schedule document into one entry per broadcast day.
  #
  # html - String of HTML for the whole listing. It is passed through
  #        clean_word_html first, and the loop below relies on the result
  #        having one block element per line.
  #
  # Returns a Hash of preview/style options plus :data, a Hash keyed by each
  # day's Time#to_i value whose entries hold :publish_now, :matrix_parent and
  # the accumulated :body markup (with the licence block appended).
  def self.parse!(html)
    line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    # In this document programme headings arrive as <h3> elements.
    line_is_bold_regexp = /<h3>/i
    # NOTE(review): `find(:all)` is the older ActiveRecord finder form --
    # confirm it is still available in the Rails version this runs under.
    links = Programme.find(:all)
    html = clean_word_html(html)
    # "Disc 1" lines carry no start time in the source; give them one.
    html.gsub!(/<p><b>Disc 1/, '<p><b>12.00 - Disc 1')

    # the hash holds data for the current week, keyed by the day's Time#to_i
    week = {}
    # there are also some preview and license options passed back
    data = {
      :preview_class => 'concert',
      :preview_id => 'timetable',
      :upload => true,
      :style_id => 'timetable',
      :style_presentation => 'div',
    }

    # Entry for the day currently being filled; nil until a date line is seen.
    current_day = nil
    html.each_line do |line|
      case line
      # the date title for a page
      when line_has_date_regexp
        date = Time.parse($1)
        # a numerical version of the date is the key, for sorting the hash;
        # a repeated date heading starts that day's entry afresh
        current_day = week[date.to_i] = {
          :publish_now => 1,
          :matrix_parent => 35103,
          :body => make_day_title(date),
        }
      when line_is_bold_regexp
        # Lines ahead of the first date heading have no day to belong to, so
        # they are skipped rather than raising NoMethodError on nil.
        next unless current_day
        current_day[:body] += clean_and_format_heading(line, links)
      else
        next unless current_day
        current_day[:body] += clean_description(line)
      end
    end

    week = add_cc_license(week, {:type => 'cc-nd'})
    data[:data] = week
    data
  end
end
require 'rubygems'
require 'time' # Time.parse, used by the schedule parsers
require 'sanitize'
# Shared helpers for cleaning Word-exported schedule HTML and decorating it
# with headings, emphasis, programme links, bilingual day titles and a
# licence footer. Every helper is a class method so that a subclass's own
# class-level `parse!` can call it without a receiver.
class Parser
  # English month and weekday names mapped to the te reo Maori names used in
  # the bilingual day titles.
  # NOTE(review): April is usually spelt "Paenga-whāwhā"; the string is kept
  # exactly as found -- confirm with the content owners before changing it.
  MAORI_DATE_NAMES = {
    'January' => 'Kohi-tātea',
    'February' => 'Hui-tanguru',
    'March' => 'Poutū-te-rangi',
    'April' => 'Paengawhā-whā',
    'May' => 'Haratua',
    'June' => 'Pipiri',
    'July' => 'Hōngongoi',
    'August' => 'Here-turi-kōkā',
    'September' => 'Mahuru',
    'October' => 'Whiringa-ā-nuku',
    'November' => 'Whiringa-ā-rangi',
    'December' => 'Hakihea',
    'Monday' => 'Rāhina',
    'Tuesday' => 'Rātū',
    'Wednesday' => 'Rāapa',
    'Thursday' => 'Rāpare',
    'Friday' => 'Rāmere',
    'Saturday' => 'Rāhoroi',
    'Sunday' => 'Rātapu'
  }.freeze

  # Case-sensitive match for any one of the English names above.
  MAORI_DATE_NAMES_REGEXP = Regexp.union(MAORI_DATE_NAMES.keys)

  # Reduces a Word HTML export to a small set of elements, one per line.
  #
  # dirty_html - String of raw HTML. It is not modified.
  # elements   - Array of element names Sanitize is told to keep.
  # attributes - Hash of attributes Sanitize is told to keep.
  #
  # Returns a new String in which each closing p/h1-h6/dt/dd/dl tag (and each
  # opening dl tag) is followed by a newline.
  def self.clean_word_html(dirty_html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes={})
    email_regex = /<p>Email:\s+((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/i

    # Flatten to a single line on a copy, so the caller's string is left
    # alone. Only CR and LF are replaced: the previous class `[\n|\r]` also
    # turned literal "|" characters into spaces.
    flattened = dirty_html.gsub(/[\r\n]/, ' ')
    html = tidy(flattened)

    # keep only the things we want.
    html = Sanitize.clean(html, :elements => elements, :attributes => attributes)

    # butt up any tags
    # NOTE(review): the original pattern here rendered as a single blank,
    # which is either a no-op or an invisible non-breaking space; the intent
    # is assumed to be "non-breaking space -> space" and is spelt out.
    html.gsub!(/&nbsp;|&#160;|\u00A0/, ' ')
    html.gsub!(/>\s+</, '><')

    # remove email address lines
    html.gsub!(email_regex, '<p>')

    # post sanitize cleanup of empty blocks
    # the order of removal is important - this is the way word stacks these elements
    html.gsub!(/<i><\/i>/, '')
    html.gsub!(/<b><\/b>/, '')
    html.gsub!(/<\/b><b>/, '')
    html.gsub!(/<p><\/p>/, '')
    html.gsub!(/<p><b><\/b><\/p>/, '')

    # misc - fix butted times
    html.gsub!(/(\d)am /, '\1 am ')
    html.gsub!(/(\d)pm /, '\1 pm ')

    # misc - remove multiple spaces that may cause doc specific regexs to fail (in dates for example)
    html.gsub!(/\s+/, ' ')

    # add new lines at the end of lines
    html.gsub!(/<\/(p|h\d|dt|dd|dl)>/, "</\\1>\n")
    html.gsub!(/<dl>/, "<dl>\n")
    html
  end

  # Pipes HTML through the external `tidy` program.
  #
  # dirty_html - String of HTML to clean.
  #
  # Returns tidy's output, or dirty_html itself when tidy could not be run or
  # produced no output. tidy's own diagnostics go to log/tidy_errors.log
  # under RAILS_ROOT.
  def self.tidy(dirty_html)
    error_file = File.join(RAILS_ROOT, '/log/tidy_errors.log')
    tidy_options = '--word-2000 1 --indent 0 --bare 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -utf8'
    cleaned_html = nil

    begin
      # The block form closes the pipe even if writing or reading raises.
      IO.popen("tidy -f #{error_file} #{tidy_options}", 'w+') do |pipe|
        pipe.write(dirty_html)
        pipe.close_write
        cleaned_html = pipe.read
      end
    rescue Errno::EPIPE, Errno::ENOENT => e
      # EPIPE: tidy went away while being written to. ENOENT: the binary is
      # not installed. Either way fall back to the input below. (The message
      # was previously built with String + Exception, which raises TypeError.)
      $stderr.puts "Running 'tidy' failed: #{e.message}"
    end

    return dirty_html if cleaned_html.nil? || cleaned_html.empty?
    cleaned_html
  end

  # Strips markup via Sanitize and trims surrounding whitespace.
  #
  # html       - String of HTML.
  # elements   - Array of element names to keep (default: none).
  # attributes - Hash of attributes to keep (default: none).
  #
  # Returns the stripped String.
  def self.strip_tags(html, elements=[], attributes={})
    Sanitize.clean(html, :elements => elements, :attributes => attributes).strip
  end

  # A heading is a bold or word heading style normally applied to a
  # main programme name with a time at the start of the line.
  #
  # heading    - String holding one heading line, or nil.
  # programmes - objects responding to web_display_name and web_path; each
  #              occurrence of a display name (case-insensitive) is linked
  #              to its path.
  # h_level    - heading level used for the wrapping element.
  #
  # Returns the heading wrapped in <hN>...</hN> plus a newline, or the string
  # "no heading" when heading is nil.
  def self.clean_and_format_heading(heading, programmes, h_level='4' )
    return "no heading" unless heading

    heading = strip_tags(heading)
    heading.gsub!(/ RR/, '')
    # things in brackets ("|" was previously accepted inside them by accident)
    heading.gsub!(/\(([\w\s.,]+)\)/, '<strong>(\1)</strong>')
    heading.gsub!(/((\d{2})\.(\d{2})) /, '<em>\2:\3</em> ') # times with a space after (not in a list)
    heading.gsub!(/((\d{1})\.(\d{2})) /, '<em> \2:\3</em> ') # times with a space after (not in a list)
    heading.gsub!(/((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/, '') # email
    heading.gsub!(/\n/, '')
    heading.strip!

    programmes.each do |programme|
      name = programme.web_display_name
      path = programme.web_path
      # An empty name would match at every position, so skip it as well.
      next if name.to_s.empty? || !path

      # Escape the name so characters such as "?", "(" or "+" match
      # literally; the block form keeps backslashes in the path from being
      # read as back-references.
      heading.gsub!(/#{Regexp.escape(name)}/i) do |found|
        "<a href=\"#{path}\">#{found}</a>"
      end
    end

    "<h#{h_level}>" + heading + "</h#{h_level}>\n"
  end

  # A description is a non heading line that describes the programme or its contents.
  #
  # description      - String holding one line; it is edited in place.
  # process_brackets - when true, bracketed text is wrapped in <em>.
  #
  # Returns the same String object after the replacements.
  def self.clean_description(description, process_brackets=true)
    description.gsub!(/<b>/, '<strong>')
    description.gsub!(/<i>/, '<em>')
    description.gsub!(/<\/b>/, '</strong>')
    description.gsub!(/<\/i>/, '</em>')
    description.gsub!(/ RR/, '')
    description.gsub!(/((\d{1,2})\.(\d{2})) /, '<strong>\2:\3</strong> ')
    description.gsub!(/Email:\s+((\w|\-|\_|\.)+\@((\w|\-|\_)+\.)+[a-zA-Z]{2,})/i, '')
    if process_brackets
      # "|" was previously accepted inside the brackets by accident
      description.gsub!(/\(([\w\s.,]+)\)/, '<em>(\1)</em>')
    end
    description
  end

  # Builds the bilingual <h2> title for a day.
  #
  # date - object responding to strftime.
  #
  # Returns the heading markup with the English date in a span of class
  # "eng" and the converted date in a span of class "reo".
  def self.make_day_title(date)
    # %e pads single-digit days with a space, so those days get two spaces.
    text_date = date.strftime("%A %e %B %Y")
    tereo_date = convert_date_to_maori(text_date)
    "<h2 class=\"bi\"><span class=\"eng\">#{text_date}</span> " \
      "<span class=\"reo\">#{tereo_date}</span></h2>"
  end

  # Swaps English month and weekday names for the MAORI_DATE_NAMES entries.
  #
  # date_string - String containing English names; it is not modified.
  #
  # Returns a new String.
  def self.convert_date_to_maori(date_string)
    date_string.gsub(MAORI_DATE_NAMES_REGEXP) { |english| MAORI_DATE_NAMES[english] }
  end

  # Appends the licence markup to the :body of every entry.
  #
  # data - Hash whose values are Hashes holding a :body String.
  # opts - passed through to generate_cc_license_html.
  #
  # Returns data, with its entries updated in place.
  def self.add_cc_license(data, opts=nil)
    cc = generate_cc_license_html(opts)
    data.each_value do |entry|
      entry[:body] += cc
    end
    data
  end

  # Returns the licence footer markup.
  #
  # type - currently ignored; the same by-nd block is always returned.
  def self.generate_cc_license_html(type)
    [
      '<div class="license">',
      '<p><a href="http://creativecommons.org/licenses/by-nd/3.0/nz/" rel="license"><img src="http://i.creativecommons.org/l/by-nd/3.0/nz/88x31.png" alt="Creative Commons License"/></a><br/>Radio New Zealand\'s Programme Schedules are licensed under the <a href="http://creativecommons.org/licenses/by-nd/3.0/nz/" rel="license">Creative Commons Attribution-No Derivative Works 3.0 New Zealand License</a>.</p>',
      '<p>Please identify us as author of the programme schedules by adding a credit to "Radio New Zealand Limited" and providing a link to our website, www.radionz.co.nz.</p>',
      '<p>If you wish to adapt our programme schedules, please see our <a href="/legal/programme_schedules_tou">Terms of Use for Adapting Programme Schedules</a></p>',
      '</div>'
    ].join("\n")
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment