Skip to content

Instantly share code, notes, and snippets.

@rhulse
Created May 13, 2011 21:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rhulse/971345 to your computer and use it in GitHub Desktop.
Save rhulse/971345 to your computer and use it in GitHub Desktop.
ELF Schedule Importer
# This code is for illustrative purposes only and should be read in conjunction
# with this blog post:
# http://richardhulse.blogspot.com/2011/05/rebuilding-radio-nz-part-6-schedules.html
require 'open-uri'
require 'html_parser_docs'
namespace "migrate" do
  desc "Fetch the remote files"
  task :fetch_schedules => [:environment] do
    # Resolve the schedule index URL for the requested station type.
    # NOTE(review): `path` is computed but never used — this task performs
    # no fetch; the actual download happens in load_pages below. Confirm
    # whether this task is vestigial or missing its body.
    type   = ENV['type']
    start  = ENV['start']
    finish = ENV['finish']
    path = ''
    case type
    when 'national_schedules'
      path = 'http://www.radionz.co.nz/national/schedules/'
    when 'concert_schedules'
      path = 'http://www.radionz.co.nz/concert/schedules/'
    end
  end

  desc "Import all Radio NZ schedules"
  task :all_schedules => [:environment] do
    # Bug fix: the guard originally used `||`, so providing only ONE of
    # start/finish skipped the help text and later crashed in
    # Time.parse(nil). Both options are required, so require BOTH.
    unless ENV['start'] && ENV['finish']
      help
      exit
    end
    n = NationalScheduleLiveParser.new
    load_pages(Station.national, ENV['start'], ENV['finish'], 'http://www.radionz.co.nz/national/schedules/', n)
    c = ConcertScheduleLiveParser.new
    load_pages(Station.concert, ENV['start'], ENV['finish'], 'http://www.radionz.co.nz/concert/schedules/', c)
  end

  desc "Import all schedules from radionz.co.nz/national/schedules"
  task :live_national_schedules => [:environment] do
    p = NationalScheduleLiveParser.new
    # Same guard fix as :all_schedules — both dates are mandatory.
    unless ENV['start'] && ENV['finish']
      help
      exit
    end
    load_pages(Station.national, ENV['start'], ENV['finish'], 'http://www.radionz.co.nz/national/schedules/', p)
  end

  desc "Import all schedules from radionz.co.nz/concert/schedules"
  task :live_concert_schedules => [:environment] do
    p = ConcertScheduleLiveParser.new
    # Same guard fix as :all_schedules — both dates are mandatory.
    unless ENV['start'] && ENV['finish']
      help
      exit
    end
    load_pages(Station.concert, ENV['start'], ENV['finish'], 'http://www.radionz.co.nz/concert/schedules/', p)
  end
end
# Fetch one schedule page per day in the inclusive [start, finish] range
# for the given station, parse it, and create ScheduleEvent records.
#
# station  - the Station the imported events belong to
# start    - date string parseable by Time.parse (e.g. 'yyyymmdd')
# finish   - date string parseable by Time.parse, inclusive
# base_url - schedule index URL; the yyyymmdd date is appended to it
# parser   - object whose #parse(html) returns entries sortable into
#            [epoch_seconds, item] pairs — presumably a Hash keyed by
#            epoch seconds; confirm against the parser classes
def load_pages(station, start, finish, base_url, parser)
  start  = Time.parse(start)
  finish = Time.parse(finish)
  # Reset this inside the model for times when this function is called
  # several times for different stations in a bulk import.
  ScheduleEvent.programmes = nil
  puts "Starting at #{start}"
  puts "Finishing at #{finish}"
  # Number of days covered, inclusive of both endpoints.
  days = ((finish - start) / 60 / 60 / 24) + 1
  days.to_i.times do |day_offset|
    date = (start + day_offset.days).strftime('%Y%m%d')
    url  = "#{base_url}#{date}"
    doc  = Nokogiri::HTML(get_url(url))
    puts "====================="
    puts "URL => #{url}"
    # Bug fix: gsub! returns nil when no substitution is made, which
    # turned whitespace-free titles into nil. Use non-bang gsub instead.
    title = doc.at_css("title").text.to_s.gsub(/\s+/, ' ')
    puts "TITLE => #{title}"
    html = doc.at_css("#timetable")
    # Remove the CC license div from the content as it cannot be
    # line-parsed correctly due to a line break after the <br>.
    # This is way more elegant anyway...
    html.xpath('//div[@class = "license"]').each(&:remove)
    events = parser.parse(html.to_s)
    # Entries are keyed by epoch seconds; import in chronological order.
    # (Renamed from `index` to avoid shadowing the day-loop counter.)
    events.sort.each do |epoch, item|
      item[:starts_at] = Time.at(epoch)
      puts "======="
      puts item[:starts_at]
      puts item[:title]
      puts item[:body]
      ScheduleEvent.create_from_load(item, station)
    end
  end
end
# Get a URL, or use a cached copy of it.
# We cache to speed things up during testing and reduce load
# on the site during the actual import.
#
# url - the http:// URL to fetch
#
# Returns the page body as a String; writes a cache file under
# content_to_migrate/cache on first fetch (the directory must exist).
def get_url(url)
  # Derive a flat cache filename from the URL, e.g.
  # "www.radionz.co.nz-national-schedules-20110513.html".
  cache_file_name = url.gsub(%r{http://}, '').gsub(%r{/}, '-') + '.html'
  cache_file = File.join(Rails.root, 'content_to_migrate/cache', cache_file_name)
  if FileTest.exist?(cache_file)
    File.read(cache_file)
  else
    # URI#read (from open-uri, required at the top of this file) replaces
    # the deprecated Kernel#open-on-a-URL form and closes the underlying
    # IO automatically, which the original `open(url).read` never did.
    html = URI.parse(url).read
    File.open(cache_file, 'w') { |f| f.write(html) }
    html
  end
end
# Print usage instructions for the schedule import tasks.
# Shown when the start/end date options are missing or invalid.
def help
  usage_lines = [
    'import:live_schedules - you have missing or invalid options',
    'usage:',
    'rake import:live_schedules start=yyyymmdd end=yyyymmdd'
  ]
  puts usage_lines.join("\n")
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment