ELF Schedule Importer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code is for illustrative purposes only and should be read in conjunction
# with this blog post:
# http://richardhulse.blogspot.com/2011/05/rebuilding-radio-nz-part-6-schedules.html
require 'fileutils'
require 'open-uri'

require 'html_parser_docs'
# Rake tasks that import Radio NZ schedule pages.
#
# The import tasks read their date range from the environment:
#   start=yyyymmdd finish=yyyymmdd
# Both values are mandatory because load_pages parses each of them.
namespace "migrate" do
  national_url = 'http://www.radionz.co.nz/national/schedules/'
  concert_url = 'http://www.radionz.co.nz/concert/schedules/'

  # Prints the usage text and exits unless BOTH ends of the date range were
  # supplied. Checking with || would let a half-specified range through and
  # fail later inside Time.parse(nil).
  require_date_range = lambda do
    unless ENV['start'] && ENV['finish']
      help
      exit
    end
  end

  desc "Fetch the remote files"
  task :fetch_schedules => [:environment] do
    # NOTE(review): this task only resolves the base path for the requested
    # schedule type; nothing is fetched yet. The values below are read but not
    # used further in the visible code.
    type = ENV['type']
    start = ENV['start']
    finish = ENV['finish']
    path = case type
           when 'national_schedules' then national_url
           when 'concert_schedules' then concert_url
           else ''
           end
  end

  desc "Import all Radio NZ schedules"
  task :all_schedules => [:environment] do
    require_date_range.call
    load_pages(Station.national, ENV['start'], ENV['finish'], national_url, NationalScheduleLiveParser.new)
    load_pages(Station.concert, ENV['start'], ENV['finish'], concert_url, ConcertScheduleLiveParser.new)
  end

  desc "Import all schedules from radionz.co.nz/national/schedules"
  task :live_national_schedules => [:environment] do
    parser = NationalScheduleLiveParser.new
    require_date_range.call
    load_pages(Station.national, ENV['start'], ENV['finish'], national_url, parser)
  end

  desc "Import all schedules from radionz.co.nz/concert/schedules"
  task :live_concert_schedules => [:environment] do
    parser = ConcertScheduleLiveParser.new
    require_date_range.call
    load_pages(Station.concert, ENV['start'], ENV['finish'], concert_url, parser)
  end
end
# Imports one schedule page per day, for every day from +start+ to +finish+
# inclusive, and stores each parsed event against +station+.
#
# station  - passed through unchanged to ScheduleEvent.create_from_load
# start    - first day to import, as a string accepted by Time.parse
# finish   - last day to import, as a string accepted by Time.parse
# base_url - URL prefix; the day (formatted yyyymmdd) is appended to it
# parser   - object whose #parse takes the timetable HTML string
#
# NOTE(review): `offset.days` is not core Ruby; presumably ActiveSupport is
# loaded through the Rails :environment task - confirm.
def load_pages(station, start, finish, base_url, parser)
  start = Time.parse(start)
  finish = Time.parse(finish)

  # reset this inside the model for times when this function is called
  # several times for different stations in a bulk import
  ScheduleEvent.programmes = nil

  puts "Starting at #{start}"
  puts "Finishing at #{finish}"

  # Round rather than truncate: when a daylight-saving change falls inside the
  # range, the span is an hour short of a whole number of days and truncating
  # would silently drop the final day.
  days = ((finish - start) / 60 / 60 / 24).round + 1

  days.times do |offset|
    date = (start + offset.days).strftime('%Y%m%d')
    url = "#{base_url}#{date}"
    doc = Nokogiri::HTML(get_url(url))

    puts "====================="
    puts "URL => #{url}"

    # gsub, not gsub!: the bang form returns nil when nothing was replaced,
    # which would blank out a title containing no whitespace.
    title = doc.at_css("title").text.to_s.gsub(/\s+/, ' ')
    puts "TITLE => #{title}"

    html = doc.at_css("#timetable")

    # remove the CC license div from the content as
    # it cannot be line parsed correctly due to a line break
    # after the <br>
    # The leading "." keeps the search inside the timetable node instead of
    # scanning the whole document.
    html.xpath('.//div[@class = "license"]').each do |node|
      node.remove
    end

    # presumably parse returns a hash keyed by epoch seconds - verify against
    # the parser classes; each sorted entry is used as a [timestamp, item] pair
    events = parser.parse(html.to_s)
    events.sort.each do |timestamp, item|
      item[:starts_at] = Time.at(timestamp) #.strftime('%Y-%m-%d %H:%M')
      puts "======="
      puts item[:starts_at]
      puts item[:title]
      puts item[:body]
      ScheduleEvent.create_from_load(item, station)
    end
  end
end
# Get a url, or use a cached copy of it.
#
# We cache to speed things up during testing and reduce load
# on the site during actual import.
#
# url - the address to fetch
#
# Returns the page body as a String. The cache lives under
# Rails.root/content_to_migrate/cache, in a file named after the URL with the
# "http://" prefix removed and every "/" replaced by "-".
def get_url(url)
  cache_file_name = url.gsub(%r{http://}, '').tr('/', '-') + '.html'
  cache_file = File.join(Rails.root, 'content_to_migrate/cache', cache_file_name)

  return File.read(cache_file) if File.exist?(cache_file)

  # Kernel#open stopped fetching URLs in Ruby 3.0; URI.open is the supported
  # open-uri entry point. Older Rubies do not expose it publicly, so fall back
  # to the original call there. The block form closes the handle after reading.
  html = if URI.respond_to?(:open)
           URI.open(url, &:read)
         else
           open(url, &:read)
         end

  # A fresh checkout may not have the cache directory yet; create it instead
  # of failing on the write.
  FileUtils.mkdir_p(File.dirname(cache_file))
  File.open(cache_file, 'w') { |f| f.write(html) }

  html
end
# Prints usage for the schedule import tasks.
#
# The text names the tasks as they are declared (under the "migrate"
# namespace) and the environment variables they actually read, which are
# `start` and `finish`.
def help
  puts "migrate:*_schedules - you have missing or invalid options",
       "usage:",
       "rake migrate:all_schedules start=yyyymmdd finish=yyyymmdd",
       "rake migrate:live_national_schedules start=yyyymmdd finish=yyyymmdd",
       "rake migrate:live_concert_schedules start=yyyymmdd finish=yyyymmdd"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment