Skip to content

Instantly share code, notes, and snippets.

@rhulse
Created May 13, 2011 21:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rhulse/971345 to your computer and use it in GitHub Desktop.
Save rhulse/971345 to your computer and use it in GitHub Desktop.
ELF Schedule Importer
# This code is for illustrative purposes only and should be read in conjunction
# with this blog post:
# http://richardhulse.blogspot.com/2011/05/rebuilding-radio-nz-part-6-schedules.html
require 'open-uri'
require 'html_parser_docs'
namespace "migrate" do
  desc "Fetch the remote files"
  task :fetch_schedules => [:environment] do
    # Resolve the schedule index URL for the requested station type.
    # NOTE(review): `path` is computed but never used — this task performs
    # no fetch; the actual download happens in load_pages below. Confirm
    # whether this task is vestigial or missing its body.
    type   = ENV['type']
    start  = ENV['start']
    finish = ENV['finish']
    path = ''
    case type
    when 'national_schedules'
      path = 'http://www.radionz.co.nz/national/schedules/'
    when 'concert_schedules'
      path = 'http://www.radionz.co.nz/concert/schedules/'
    end
  end

  desc "Import all Radio NZ schedules"
  task :all_schedules => [:environment] do
    # Bug fix: the guard originally used `||`, so providing only ONE of
    # start/finish skipped the help text and later crashed in
    # Time.parse(nil). Both options are required, so require BOTH.
    unless ENV['start'] && ENV['finish']
      help
      exit
    end
    n = NationalScheduleLiveParser.new
    load_pages(Station.national, ENV['start'], ENV['finish'], 'http://www.radionz.co.nz/national/schedules/', n)
    c = ConcertScheduleLiveParser.new
    load_pages(Station.concert, ENV['start'], ENV['finish'], 'http://www.radionz.co.nz/concert/schedules/', c)
  end

  desc "Import all schedules from radionz.co.nz/national/schedules"
  task :live_national_schedules => [:environment] do
    p = NationalScheduleLiveParser.new
    # Same guard fix as :all_schedules — both dates are mandatory.
    unless ENV['start'] && ENV['finish']
      help
      exit
    end
    load_pages(Station.national, ENV['start'], ENV['finish'], 'http://www.radionz.co.nz/national/schedules/', p)
  end

  desc "Import all schedules from radionz.co.nz/concert/schedules"
  task :live_concert_schedules => [:environment] do
    p = ConcertScheduleLiveParser.new
    # Same guard fix as :all_schedules — both dates are mandatory.
    unless ENV['start'] && ENV['finish']
      help
      exit
    end
    load_pages(Station.concert, ENV['start'], ENV['finish'], 'http://www.radionz.co.nz/concert/schedules/', p)
  end
end
# Fetch one schedule page per day in the inclusive [start, finish] range
# for the given station, parse it, and create ScheduleEvent records.
#
# station  - the Station the imported events belong to
# start    - date string parseable by Time.parse (e.g. 'yyyymmdd')
# finish   - date string parseable by Time.parse, inclusive
# base_url - schedule index URL; the yyyymmdd date is appended to it
# parser   - object whose #parse(html) returns entries sortable into
#            [epoch_seconds, item] pairs — presumably a Hash keyed by
#            epoch seconds; confirm against the parser classes
def load_pages(station, start, finish, base_url, parser)
  start  = Time.parse(start)
  finish = Time.parse(finish)
  # Reset this inside the model for times when this function is called
  # several times for different stations in a bulk import.
  ScheduleEvent.programmes = nil
  puts "Starting at #{start}"
  puts "Finishing at #{finish}"
  # Number of days covered, inclusive of both endpoints.
  days = ((finish - start) / 60 / 60 / 24) + 1
  days.to_i.times do |day_offset|
    date = (start + day_offset.days).strftime('%Y%m%d')
    url  = "#{base_url}#{date}"
    doc  = Nokogiri::HTML(get_url(url))
    puts "====================="
    puts "URL => #{url}"
    # Bug fix: gsub! returns nil when no substitution is made, which
    # turned whitespace-free titles into nil. Use non-bang gsub instead.
    title = doc.at_css("title").text.to_s.gsub(/\s+/, ' ')
    puts "TITLE => #{title}"
    html = doc.at_css("#timetable")
    # Remove the CC license div from the content as it cannot be
    # line-parsed correctly due to a line break after the <br>.
    # This is way more elegant anyway...
    html.xpath('//div[@class = "license"]').each(&:remove)
    events = parser.parse(html.to_s)
    # Entries are keyed by epoch seconds; import in chronological order.
    # (Renamed from `index` to avoid shadowing the day-loop counter.)
    events.sort.each do |epoch, item|
      item[:starts_at] = Time.at(epoch)
      puts "======="
      puts item[:starts_at]
      puts item[:title]
      puts item[:body]
      ScheduleEvent.create_from_load(item, station)
    end
  end
end
# Get a URL, or use a cached copy of it.
# We cache to speed things up during testing and reduce load
# on the site during the actual import.
#
# url - the http:// URL to fetch
#
# Returns the page body as a String; writes a cache file under
# content_to_migrate/cache on first fetch (the directory must exist).
def get_url(url)
  # Derive a flat cache filename from the URL, e.g.
  # "www.radionz.co.nz-national-schedules-20110513.html".
  cache_file_name = url.gsub(%r{http://}, '').gsub(%r{/}, '-') + '.html'
  cache_file = File.join(Rails.root, 'content_to_migrate/cache', cache_file_name)
  if FileTest.exist?(cache_file)
    File.read(cache_file)
  else
    # URI#read (from open-uri, required at the top of this file) replaces
    # the deprecated Kernel#open-on-a-URL form and closes the underlying
    # IO automatically, which the original `open(url).read` never did.
    html = URI.parse(url).read
    File.open(cache_file, 'w') { |f| f.write(html) }
    html
  end
end
# Print usage instructions for the schedule import tasks.
# Shown when the start/end date options are missing or invalid.
def help
  usage_lines = [
    'import:live_schedules - you have missing or invalid options',
    'usage:',
    'rake import:live_schedules start=yyyymmdd end=yyyymmdd'
  ]
  puts usage_lines.join("\n")
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment