# rhulse/gist:971362

## gistfile1.rb
# This code is for illustrative purposes only and should be read in conjunction
# with this blog post:
# http://richardhulse.blogspot.com/2011/05/rebuilding-radio-nz-part-6-schedules.html

# This code is released under an MIT license (the same as Rails).


# Parses the Radio New Zealand National weekly schedule (Word HTML) into one
# entry per day.
#
# The helpers called here (clean_word_html, make_day_title,
# clean_and_format_heading, clean_description, add_cc_license) are not
# defined in this class; they are expected to be inherited from Parser.
class NationalScheduleParser < Parser

  # Builds the per-day schedule data for a week.
  #
  # html - String containing the Word-exported HTML for the whole week.
  #
  # Returns a Hash of preview/style options. Its :data entry is a Hash keyed
  # by each day's Time#to_i; every value holds :publish_now, :matrix_parent
  # and the HTML :body for that day, with the licence block from
  # add_cc_license appended.
  def self.parse!(html)
    line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    line_is_bold_regexp = /<p><b>/i

    links = Programme.find(:all)

    html = clean_word_html(html)

    # This entry is not bold in the source; wrap it in <b> so the line matches
    # line_is_bold_regexp and is formatted as a heading.
    html.gsub!(/<p>12\.04 All Night Programme/, '<p><b>12.04 All Night Programme</b>')

    # the hash holds data for the current week, keyed by the day's Time#to_i
    week = {}

    # there are also some preview and license options passed back
    data = {
      :preview_class      => 'national',
      :preview_id         => 'timetable',
      :upload             => true,
      :style_id           => 'timetable',
      :style_presentation => 'div',
    }

    # Entry for the day currently being filled in; nil until the first date
    # line has been seen.
    current_day = nil

    html.each_line do |line|
      # NOTE: `when expr :` was Ruby 1.8-only syntax; a plain newline after
      # the expression parses on every Ruby version.
      case line
      # some things to omit
      when /RADIO NEW ZEALAND NATIONAL Programme Listing/
        next

      # the date title for a page
      # NOTE(review): the pattern is unanchored, so any later line that merely
      # mentions a date also starts a new day - confirm inputs never do this.
      when line_has_date_regexp
        date = Time.parse($1)

        current_day = {
          :publish_now   => 1,
          :matrix_parent => 35083,
          :body          => make_day_title(date)
        }
        # a numerical version of the date for sorting the hash
        week[date.to_i] = current_day

      # Lines that arrive before the first date line belong to no day; they
      # are skipped instead of raising NoMethodError on a missing entry.
      when line_is_bold_regexp
        current_day[:body] << clean_and_format_heading(line, links) if current_day

      else
        current_day[:body] << clean_description(line) if current_day
      end
    end

    week = add_cc_license(week, {:type => 'cc-nd'})

    data[:data] = week

    data
  end
end


# Parses the Radio New Zealand Concert weekly schedule (Word HTML) into one
# entry per day.
#
# The helpers called here (clean_word_html, make_day_title,
# clean_and_format_heading, clean_description, add_cc_license) are not
# defined in this class; they are expected to be inherited from Parser.
class ConcertScheduleParser < Parser

  # Builds the per-day schedule data for a week.
  #
  # html - String containing the Word-exported HTML for the whole week.
  #
  # Returns a Hash of preview/style options. Its :data entry is a Hash keyed
  # by each day's Time#to_i; every value holds :publish_now, :matrix_parent
  # and the HTML :body for that day, with the licence block from
  # add_cc_license appended.
  def self.parse!(html)
    line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
    # Concert headings are <h3> elements rather than bold paragraphs.
    line_is_bold_regexp = /<h3>/i

    links = Programme.find(:all)

    html = clean_word_html(html)

    # Give bold "Disc 1" paragraphs an explicit "12.00 - " prefix.
    html.gsub!(/<p><b>Disc 1/, '<p><b>12.00 - Disc 1')

    # the hash holds data for the current week, keyed by the day's Time#to_i
    week = {}

    # there are also some preview and license options passed back
    data = {
      :preview_class      => 'concert',
      :preview_id         => 'timetable',
      :upload             => true,
      :style_id           => 'timetable',
      :style_presentation => 'div',
    }

    # Entry for the day currently being filled in; nil until the first date
    # line has been seen.
    current_day = nil

    html.each_line do |line|
      # NOTE: `when expr :` was Ruby 1.8-only syntax; a plain newline after
      # the expression parses on every Ruby version.
      case line
      # the date title for a page
      # NOTE(review): the pattern is unanchored, so any later line that merely
      # mentions a date also starts a new day - confirm inputs never do this.
      when line_has_date_regexp
        date = Time.parse($1)

        current_day = {
          :publish_now   => 1,
          :matrix_parent => 35103,
          :body          => make_day_title(date)
        }
        # a numerical version of the date for sorting the hash
        week[date.to_i] = current_day

      # Lines that arrive before the first date line belong to no day; they
      # are skipped instead of raising NoMethodError on a missing entry.
      when line_is_bold_regexp
        current_day[:body] << clean_and_format_heading(line, links) if current_day

      else
        current_day[:body] << clean_description(line) if current_day
      end
    end

    week = add_cc_license(week, {:type => 'cc-nd'})

    data[:data] = week

    data
  end
end


require 'rubygems'
require 'sanitize'

# Shared cleaning and formatting helpers for the schedule parsers.
#
# Every helper is a class method, so subclasses can call them directly from
# their own class-level parse! methods.
class Parser

  # Reduces Word-exported HTML to a small set of tags, one block per line.
  #
  # dirty_html - String of raw HTML. It is not modified.
  # elements   - Array of tag names Sanitize is told to keep.
  # attributes - Hash of attributes Sanitize is told to keep.
  #
  # Returns a new String in which every closing p/h*/dt/dd/dl tag (and every
  # opening <dl>) is followed by a newline.
  def self.clean_word_html(dirty_html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes={})
    email_regex = /<p>Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i

    # Flatten to a single line. gsub (not gsub!) leaves the caller's string
    # untouched, and the class is [\n\r] rather than [\n|\r] so that literal
    # pipe characters are no longer turned into spaces.
    html = tidy(dirty_html.gsub(/[\n\r]/, ' '))

    # keep only the things we want.
    html = Sanitize.clean(html, :elements => elements, :attributes => attributes)

    # butt up any tags
    html.gsub!(/&nbsp;/, ' ')
    html.gsub!(/>\s+</, '><')

    # remove email address lines
    html.gsub!(email_regex, '<p>')

    # post sanitize cleanup of empty blocks
    # the order of removal is important - this is the way word stacks these elements
    html.gsub!(/<i><\/i>/, '')
    html.gsub!(/<b><\/b>/, '')
    html.gsub!(/<\/b><b>/, '')
    html.gsub!(/<p><\/p>/, '')
    html.gsub!(/<p><b><\/b><\/p>/, '')

    # misc - fix butted times ("7am " -> "7 am ")
    html.gsub!(/(\d)(am|pm) /, '\1 \2 ')
    # misc - remove multiple space that may cause doc specific regexs to fail (in dates for example)
    html.gsub!(/\s+/, ' ')

    # add new lines at the end of lines
    html.gsub!(/<\/(p|h\d|dt|dd|dl)>/, "</\\1>\n")
    html.gsub!(/<dl>/, "<dl>\n")

    html
  end

  # Pipes HTML through the external `tidy` command.
  #
  # dirty_html - String of HTML to clean.
  #
  # Returns tidy's output, or dirty_html itself when tidy produced nothing
  # (including when writing to it failed with Errno::EPIPE).
  def self.tidy(dirty_html)
    error_file = File.join(RAILS_ROOT, '/log/tidy_errors.log')
    tidy_options = '--word-2000 1 --indent 0 --bare 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -utf8'
    cleaned_html = nil

    begin
      # The block form closes the pipe even when an exception is raised.
      IO.popen("tidy -f #{error_file} #{tidy_options}", 'w+') do |pipe|
        pipe.write(dirty_html)
        pipe.close_write
        cleaned_html = pipe.read
      end
    rescue Errno::EPIPE => e
      # Interpolate the exception: String + Exception raises TypeError, which
      # previously replaced the fallback below with a second error.
      $stderr.puts "Running 'tidy' failed: #{e}"
    end

    return cleaned_html if cleaned_html && !cleaned_html.empty?
    dirty_html
  end

  # Runs Sanitize over the HTML (by default keeping no elements or
  # attributes) and strips surrounding whitespace.
  #
  # Returns the cleaned String.
  def self.strip_tags(html, elements=[], attributes={})
    Sanitize.clean(html, :elements => elements, :attributes => attributes).strip
  end

  # A heading is a bold or word heading style normally applied to a
  # main programme name with a time at the start of the line
  #
  # heading    - String line of HTML, or nil.
  # programmes - Enumerable of objects responding to web_display_name and
  #              web_path; each name found in the heading is linked.
  # h_level    - heading level used for the wrapping <hN> tag.
  #
  # Returns the formatted heading followed by a newline, or "no heading"
  # when heading is nil/false.
  def self.clean_and_format_heading(heading, programmes, h_level='4' )
    return "no heading" unless heading

    heading = strip_tags(heading)
    heading.gsub!(/ RR/, '')
    heading.gsub!(/\(([\w\s.,]+)\)/, '<strong>(\1)</strong>') # things in brackets
    heading.gsub!(/((\d{2})\.(\d{2})) /, '<em>\2:\3</em> ') # times with a space after (not in a list)
    heading.gsub!(/((\d{1})\.(\d{2})) /, '<em>&nbsp;\2:\3</em> ') # times with a space after (not in a list)
    heading.gsub!(/[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/, '') # email
    heading.gsub!(/\n/, '')
    heading.strip!

    programmes.each do |programme|
      name = programme.web_display_name
      path = programme.web_path
      next unless name && path
      # An empty name would match between every character.
      next if name.to_s.empty?

      # Escape the name so characters such as + ? ( ) are matched literally,
      # and use the block form so the path is never read as a backreference.
      heading.gsub!(/#{Regexp.escape(name.to_s)}/i) do |matched|
        "<a href=\"#{path}\">#{matched}</a>"
      end
    end

    "<h#{h_level}>#{heading}</h#{h_level}>\n"
  end

  # A description is a non heading line that describes the programme or its contents
  #
  # description      - String line of HTML. It is not modified.
  # process_brackets - when true, wraps "(...)" groups in <em>.
  #
  # Returns a new String.
  def self.clean_description(description, process_brackets=true)
    cleaned = description.gsub(/<b>/, '<strong>')
    cleaned.gsub!(/<i>/, '<em>')
    cleaned.gsub!(/<\/b>/, '</strong>')
    cleaned.gsub!(/<\/i>/, '</em>')
    cleaned.gsub!(/ RR/, '')
    cleaned.gsub!(/((\d{1,2})\.(\d{2})) /, '<strong>\2:\3</strong> ')
    cleaned.gsub!(/Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i, '')
    cleaned.gsub!(/\(([\w\s.,]+)\)/, '<em>(\1)</em>') if process_brackets

    cleaned
  end

  # Builds the bilingual <h2> title for a day.
  #
  # date - object responding to strftime and day (e.g. a Time).
  #
  # Returns the heading HTML with the English date and its Maori form.
  def self.make_day_title(date)
    # date.day instead of %e: %e pads single-digit days with a space, which
    # produced "Saturday  7 May 2011".
    text_date = "#{date.strftime('%A')} #{date.day} #{date.strftime('%B %Y')}"
    tereo_date = convert_date_to_maori(text_date)

    '<h2 class="bi"><span class="eng">' + text_date + '</span> ' +
      '<span class="reo">' + tereo_date + '</span></h2>'
  end

  # Replaces English month and day names with their te reo Maori
  # equivalents (macrons are written as HTML numeric entities).
  #
  # Returns a new String; date_string is not modified.
  def self.convert_date_to_maori(date_string)
    names = {
      'January'   => 'Kohi-t&#257;tea',
      'February'  => 'Hui-tanguru',
      'March'     => 'Pout&#363;-te-rangi',
      'April'     => 'Paengawh&#257;-wh&#257;',
      'May'       => 'Haratua',
      'June'      => 'Pipiri',
      'July'      => 'H&#333;ngongoi',
      'August'    => 'Here-turi-k&#333;k&#257;',
      'September' => 'Mahuru',
      'October'   => 'Whiringa-&#257;-nuku',
      'November'  => 'Whiringa-&#257;-rangi',
      'December'  => 'Hakihea',
      'Monday'    => 'R&#257;hina',
      'Tuesday'   => 'R&#257;t&#363;',
      'Wednesday' => 'R&#257;apa',
      'Thursday'  => 'R&#257;pare',
      'Friday'    => 'R&#257;mere',
      'Saturday'  => 'R&#257;horoi',
      'Sunday'    => 'R&#257;tapu'
    }

    # One pass over the string, looking each matched name up in the table.
    date_string.gsub(Regexp.union(*names.keys)) { |english| names[english] }
  end

  # Appends the licence HTML to the :body of every entry in data.
  #
  # data - Hash whose values are Hashes with a :body String.
  # opts - passed through to generate_cc_license_html.
  #
  # Returns data.
  def self.add_cc_license(data, opts=nil)
    cc = generate_cc_license_html(opts)
    data.each_value do |day|
      # to_s so an entry without a body gets the licence instead of raising.
      day[:body] = day[:body].to_s + cc
    end
    data
  end

  # Returns the licence block as an HTML String. The argument is currently
  # ignored: the same by-nd licence text is returned for every value.
  def self.generate_cc_license_html(type)
    [
      '<div class="license">',
      '    <p><a href="http://creativecommons.org/licenses/by-nd/3.0/nz/" rel="license"><img src="http://i.creativecommons.org/l/by-nd/3.0/nz/88x31.png" alt="Creative Commons License"/></a><br/>Radio New Zealand\'s Programme Schedules are licensed under the  <a href="http://creativecommons.org/licenses/by-nd/3.0/nz/" rel="license">Creative Commons Attribution-No Derivative Works 3.0 New Zealand License</a>.</p>',
      '    <p>Please identify us as author of the programme schedules by adding a credit to "Radio New Zealand Limited" and providing a link to our website, www.radionz.co.nz.</p>',
      '    <p>If you wish to adapt our programme schedules, please see our <a href="/legal/programme_schedules_tou">Terms of Use for Adapting Programme Schedules</a></p>',
      '    </div>'
    ].join("\n")
  end
end
	# This code is for illustrative purposes only and should be read in conjunction
	# with this blog post:
	# http://richardhulse.blogspot.com/2011/05/rebuilding-radio-nz-part-6-schedules.html

	# This code is released under an MIT license (the same as Rails).


	# Parses the Radio New Zealand National weekly schedule (Word HTML) into one
	# entry per day.
	#
	# The helpers called here (clean_word_html, make_day_title,
	# clean_and_format_heading, clean_description, add_cc_license) are not
	# defined in this class; they are expected to be inherited from Parser.
	#
	# NOTE: this copy had its pipe characters backslash-escaped, which is not
	# valid Ruby; they are restored here.
	class NationalScheduleParser < Parser

	  # Builds the per-day schedule data for a week.
	  #
	  # html - String containing the Word-exported HTML for the whole week.
	  #
	  # Returns a Hash of preview/style options. Its :data entry is a Hash keyed
	  # by each day's Time#to_i; every value holds :publish_now, :matrix_parent
	  # and the HTML :body for that day, with the licence block from
	  # add_cc_license appended.
	  def self.parse!(html)
	    line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
	    line_is_bold_regexp = /<p><b>/i

	    links = Programme.find(:all)

	    html = clean_word_html(html)

	    # This entry is not bold in the source; wrap it in <b> so the line matches
	    # line_is_bold_regexp and is formatted as a heading.
	    html.gsub!(/<p>12\.04 All Night Programme/, '<p><b>12.04 All Night Programme</b>')

	    # the hash holds data for the current week, keyed by the day's Time#to_i
	    week = {}

	    # there are also some preview and license options passed back
	    data = {
	      :preview_class      => 'national',
	      :preview_id         => 'timetable',
	      :upload             => true,
	      :style_id           => 'timetable',
	      :style_presentation => 'div',
	    }

	    # Entry for the day currently being filled in; nil until the first date
	    # line has been seen.
	    current_day = nil

	    html.each_line do |line|
	      # NOTE: `when expr :` was Ruby 1.8-only syntax; a plain newline after
	      # the expression parses on every Ruby version.
	      case line
	      # some things to omit
	      when /RADIO NEW ZEALAND NATIONAL Programme Listing/
	        next

	      # the date title for a page
	      # NOTE(review): the pattern is unanchored, so any later line that merely
	      # mentions a date also starts a new day - confirm inputs never do this.
	      when line_has_date_regexp
	        date = Time.parse($1)

	        current_day = {
	          :publish_now   => 1,
	          :matrix_parent => 35083,
	          :body          => make_day_title(date)
	        }
	        # a numerical version of the date for sorting the hash
	        week[date.to_i] = current_day

	      # Lines that arrive before the first date line belong to no day; they
	      # are skipped instead of raising NoMethodError on a missing entry.
	      when line_is_bold_regexp
	        current_day[:body] << clean_and_format_heading(line, links) if current_day

	      else
	        current_day[:body] << clean_description(line) if current_day
	      end
	    end

	    week = add_cc_license(week, {:type => 'cc-nd'})

	    data[:data] = week

	    data
	  end
	end


	# Parses the Radio New Zealand Concert weekly schedule (Word HTML) into one
	# entry per day.
	#
	# The helpers called here (clean_word_html, make_day_title,
	# clean_and_format_heading, clean_description, add_cc_license) are not
	# defined in this class; they are expected to be inherited from Parser.
	#
	# NOTE: this copy had its pipe characters backslash-escaped, which is not
	# valid Ruby; they are restored here.
	class ConcertScheduleParser < Parser

	  # Builds the per-day schedule data for a week.
	  #
	  # html - String containing the Word-exported HTML for the whole week.
	  #
	  # Returns a Hash of preview/style options. Its :data entry is a Hash keyed
	  # by each day's Time#to_i; every value holds :publish_now, :matrix_parent
	  # and the HTML :body for that day, with the licence block from
	  # add_cc_license appended.
	  def self.parse!(html)
	    line_has_date_regexp = /((saturday|sunday|monday|tuesday|wednesday|thursday|friday) \d{1,2}(\w{2})? \w{3,9}( \d{4})?)/i
	    # Concert headings are <h3> elements rather than bold paragraphs.
	    line_is_bold_regexp = /<h3>/i

	    links = Programme.find(:all)

	    html = clean_word_html(html)

	    # Give bold "Disc 1" paragraphs an explicit "12.00 - " prefix.
	    html.gsub!(/<p><b>Disc 1/, '<p><b>12.00 - Disc 1')

	    # the hash holds data for the current week, keyed by the day's Time#to_i
	    week = {}

	    # there are also some preview and license options passed back
	    data = {
	      :preview_class      => 'concert',
	      :preview_id         => 'timetable',
	      :upload             => true,
	      :style_id           => 'timetable',
	      :style_presentation => 'div',
	    }

	    # Entry for the day currently being filled in; nil until the first date
	    # line has been seen.
	    current_day = nil

	    html.each_line do |line|
	      # NOTE: `when expr :` was Ruby 1.8-only syntax; a plain newline after
	      # the expression parses on every Ruby version.
	      case line
	      # the date title for a page
	      # NOTE(review): the pattern is unanchored, so any later line that merely
	      # mentions a date also starts a new day - confirm inputs never do this.
	      when line_has_date_regexp
	        date = Time.parse($1)

	        current_day = {
	          :publish_now   => 1,
	          :matrix_parent => 35103,
	          :body          => make_day_title(date)
	        }
	        # a numerical version of the date for sorting the hash
	        week[date.to_i] = current_day

	      # Lines that arrive before the first date line belong to no day; they
	      # are skipped instead of raising NoMethodError on a missing entry.
	      when line_is_bold_regexp
	        current_day[:body] << clean_and_format_heading(line, links) if current_day

	      else
	        current_day[:body] << clean_description(line) if current_day
	      end
	    end

	    week = add_cc_license(week, {:type => 'cc-nd'})

	    data[:data] = week

	    data
	  end
	end


	require 'rubygems'
	require 'sanitize'

	# Shared cleaning and formatting helpers for the schedule parsers.
	#
	# Every helper is a class method, so subclasses can call them directly from
	# their own class-level parse! methods.
	#
	# NOTE: this copy had its pipe characters backslash-escaped and its &nbsp;
	# entities lost, which left it invalid; both are restored here.
	class Parser

	  # Reduces Word-exported HTML to a small set of tags, one block per line.
	  #
	  # dirty_html - String of raw HTML. It is not modified.
	  # elements   - Array of tag names Sanitize is told to keep.
	  # attributes - Hash of attributes Sanitize is told to keep.
	  #
	  # Returns a new String in which every closing p/h*/dt/dd/dl tag (and every
	  # opening <dl>) is followed by a newline.
	  def self.clean_word_html(dirty_html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes={})
	    email_regex = /<p>Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i

	    # Flatten to a single line. gsub (not gsub!) leaves the caller's string
	    # untouched, and the class is [\n\r] so that literal pipe characters are
	    # not turned into spaces.
	    html = tidy(dirty_html.gsub(/[\n\r]/, ' '))

	    # keep only the things we want.
	    html = Sanitize.clean(html, :elements => elements, :attributes => attributes)

	    # butt up any tags
	    html.gsub!(/&nbsp;/, ' ')
	    html.gsub!(/>\s+</, '><')

	    # remove email address lines
	    html.gsub!(email_regex, '<p>')

	    # post sanitize cleanup of empty blocks
	    # the order of removal is important - this is the way word stacks these elements
	    html.gsub!(/<i><\/i>/, '')
	    html.gsub!(/<b><\/b>/, '')
	    html.gsub!(/<\/b><b>/, '')
	    html.gsub!(/<p><\/p>/, '')
	    html.gsub!(/<p><b><\/b><\/p>/, '')

	    # misc - fix butted times ("7am " -> "7 am ")
	    html.gsub!(/(\d)(am|pm) /, '\1 \2 ')
	    # misc - remove multiple space that may cause doc specific regexs to fail (in dates for example)
	    html.gsub!(/\s+/, ' ')

	    # add new lines at the end of lines
	    html.gsub!(/<\/(p|h\d|dt|dd|dl)>/, "</\\1>\n")
	    html.gsub!(/<dl>/, "<dl>\n")

	    html
	  end

	  # Pipes HTML through the external `tidy` command.
	  #
	  # dirty_html - String of HTML to clean.
	  #
	  # Returns tidy's output, or dirty_html itself when tidy produced nothing
	  # (including when writing to it failed with Errno::EPIPE).
	  def self.tidy(dirty_html)
	    error_file = File.join(RAILS_ROOT, '/log/tidy_errors.log')
	    tidy_options = '--word-2000 1 --indent 0 --bare 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -utf8'
	    cleaned_html = nil

	    begin
	      # The block form closes the pipe even when an exception is raised.
	      IO.popen("tidy -f #{error_file} #{tidy_options}", 'w+') do |pipe|
	        pipe.write(dirty_html)
	        pipe.close_write
	        cleaned_html = pipe.read
	      end
	    rescue Errno::EPIPE => e
	      # Interpolate the exception: String + Exception raises TypeError, which
	      # previously replaced the fallback below with a second error.
	      $stderr.puts "Running 'tidy' failed: #{e}"
	    end

	    return cleaned_html if cleaned_html && !cleaned_html.empty?
	    dirty_html
	  end

	  # Runs Sanitize over the HTML (by default keeping no elements or
	  # attributes) and strips surrounding whitespace.
	  #
	  # Returns the cleaned String.
	  def self.strip_tags(html, elements=[], attributes={})
	    Sanitize.clean(html, :elements => elements, :attributes => attributes).strip
	  end

	  # A heading is a bold or word heading style normally applied to a
	  # main programme name with a time at the start of the line
	  #
	  # heading    - String line of HTML, or nil.
	  # programmes - Enumerable of objects responding to web_display_name and
	  #              web_path; each name found in the heading is linked.
	  # h_level    - heading level used for the wrapping <hN> tag.
	  #
	  # Returns the formatted heading followed by a newline, or "no heading"
	  # when heading is nil/false.
	  def self.clean_and_format_heading(heading, programmes, h_level='4' )
	    return "no heading" unless heading

	    heading = strip_tags(heading)
	    heading.gsub!(/ RR/, '')
	    heading.gsub!(/\(([\w\s.,]+)\)/, '<strong>(\1)</strong>') # things in brackets
	    heading.gsub!(/((\d{2})\.(\d{2})) /, '<em>\2:\3</em> ') # times with a space after (not in a list)
	    heading.gsub!(/((\d{1})\.(\d{2})) /, '<em>&nbsp;\2:\3</em> ') # times with a space after (not in a list)
	    heading.gsub!(/[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/, '') # email
	    heading.gsub!(/\n/, '')
	    heading.strip!

	    programmes.each do |programme|
	      name = programme.web_display_name
	      path = programme.web_path
	      next unless name && path
	      # An empty name would match between every character.
	      next if name.to_s.empty?

	      # Escape the name so characters such as + ? ( ) are matched literally,
	      # and use the block form so the path is never read as a backreference.
	      heading.gsub!(/#{Regexp.escape(name.to_s)}/i) do |matched|
	        "<a href=\"#{path}\">#{matched}</a>"
	      end
	    end

	    "<h#{h_level}>#{heading}</h#{h_level}>\n"
	  end

	  # A description is a non heading line that describes the programme or its contents
	  #
	  # description      - String line of HTML. It is not modified.
	  # process_brackets - when true, wraps "(...)" groups in <em>.
	  #
	  # Returns a new String.
	  def self.clean_description(description, process_brackets=true)
	    cleaned = description.gsub(/<b>/, '<strong>')
	    cleaned.gsub!(/<i>/, '<em>')
	    cleaned.gsub!(/<\/b>/, '</strong>')
	    cleaned.gsub!(/<\/i>/, '</em>')
	    cleaned.gsub!(/ RR/, '')
	    cleaned.gsub!(/((\d{1,2})\.(\d{2})) /, '<strong>\2:\3</strong> ')
	    cleaned.gsub!(/Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i, '')
	    cleaned.gsub!(/\(([\w\s.,]+)\)/, '<em>(\1)</em>') if process_brackets

	    cleaned
	  end

	  # Builds the bilingual <h2> title for a day.
	  #
	  # date - object responding to strftime and day (e.g. a Time).
	  #
	  # Returns the heading HTML with the English date and its Maori form.
	  def self.make_day_title(date)
	    # date.day instead of %e: %e pads single-digit days with a space, which
	    # produced "Saturday  7 May 2011".
	    text_date = "#{date.strftime('%A')} #{date.day} #{date.strftime('%B %Y')}"
	    tereo_date = convert_date_to_maori(text_date)

	    '<h2 class="bi"><span class="eng">' + text_date + '</span> ' +
	      '<span class="reo">' + tereo_date + '</span></h2>'
	  end

	  # Replaces English month and day names with their te reo Maori
	  # equivalents (macrons are written as HTML numeric entities).
	  #
	  # Returns a new String; date_string is not modified.
	  def self.convert_date_to_maori(date_string)
	    names = {
	      'January'   => 'Kohi-t&#257;tea',
	      'February'  => 'Hui-tanguru',
	      'March'     => 'Pout&#363;-te-rangi',
	      'April'     => 'Paengawh&#257;-wh&#257;',
	      'May'       => 'Haratua',
	      'June'      => 'Pipiri',
	      'July'      => 'H&#333;ngongoi',
	      'August'    => 'Here-turi-k&#333;k&#257;',
	      'September' => 'Mahuru',
	      'October'   => 'Whiringa-&#257;-nuku',
	      'November'  => 'Whiringa-&#257;-rangi',
	      'December'  => 'Hakihea',
	      'Monday'    => 'R&#257;hina',
	      'Tuesday'   => 'R&#257;t&#363;',
	      'Wednesday' => 'R&#257;apa',
	      'Thursday'  => 'R&#257;pare',
	      'Friday'    => 'R&#257;mere',
	      'Saturday'  => 'R&#257;horoi',
	      'Sunday'    => 'R&#257;tapu'
	    }

	    # One pass over the string, looking each matched name up in the table.
	    date_string.gsub(Regexp.union(*names.keys)) { |english| names[english] }
	  end

	  # Appends the licence HTML to the :body of every entry in data.
	  #
	  # data - Hash whose values are Hashes with a :body String.
	  # opts - passed through to generate_cc_license_html.
	  #
	  # Returns data.
	  def self.add_cc_license(data, opts=nil)
	    cc = generate_cc_license_html(opts)
	    data.each_value do |day|
	      # to_s so an entry without a body gets the licence instead of raising.
	      day[:body] = day[:body].to_s + cc
	    end
	    data
	  end

	  # Returns the licence block as an HTML String. The argument is currently
	  # ignored: the same by-nd licence text is returned for every value.
	  def self.generate_cc_license_html(type)
	    [
	      '<div class="license">',
	      '    <p><a href="http://creativecommons.org/licenses/by-nd/3.0/nz/" rel="license"><img src="http://i.creativecommons.org/l/by-nd/3.0/nz/88x31.png" alt="Creative Commons License"/></a><br/>Radio New Zealand\'s Programme Schedules are licensed under the  <a href="http://creativecommons.org/licenses/by-nd/3.0/nz/" rel="license">Creative Commons Attribution-No Derivative Works 3.0 New Zealand License</a>.</p>',
	      '    <p>Please identify us as author of the programme schedules by adding a credit to "Radio New Zealand Limited" and providing a link to our website, www.radionz.co.nz.</p>',
	      '    <p>If you wish to adapt our programme schedules, please see our <a href="/legal/programme_schedules_tou">Terms of Use for Adapting Programme Schedules</a></p>',
	      '    </div>'
	    ].join("\n")
	  end
	end