Skip to content

Instantly share code, notes, and snippets.

@rhulse
Created April 30, 2011 01:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rhulse/949309 to your computer and use it in GitHub Desktop.
Save rhulse/949309 to your computer and use it in GitHub Desktop.
Recipe importer for ELF
# This code is for illustrative purposes only and should be read in conjunction
# with this blog post:
# http://richardhulse.blogspot.com/2011/04/rebuilding-radio-nz-part-4-content.html
# I was still learning Rails and Ruby at the time this was written,
# so it is a bit rough and ready.
# This code is released under an MIT license (the same as Rails).
require 'rubygems'
require 'nokogiri'
namespace "import" do
# Rake task that rebuilds the recipe collection from an XML export.
#
# Usage: rake import:recipes_from_xml file=path/to/recipes.xml
#
# Every existing Recipe is destroyed first. Each <recipe> element is then
# re-parsed on its own, its <body> HTML is cleaned with tidy() and scanned
# line by line for a title (<h2>), a chef credit and a programme name.
# Only bodies that yield a title, a chef name of at least two words and a
# known programme are saved; everything else is reported as "MISSED!".
desc "Imports recipes from XML"
task :recipes_from_xml => :environment do
  file_name = ENV['file']
  if file_name.nil? || file_name.empty?
    # Fail loudly (non-zero exit) instead of silently doing nothing.
    abort 'Usage: rake import:recipes_from_xml file=path/to/recipes.xml'
  end

  # Matches any tag so it can be stripped from a line.
  tag_pattern = /<(.|\n)*?>/
  # Lines crediting a chef and/or programme; these are kept out of the body.
  credit_pattern = /Chef:|as heard on|recipe from|with Jim Mora|recipe by|Chef(.*)as heard/i
  # Checked in order; the first pattern found in a credit line wins.
  programme_patterns = [
    [/Afternoon/i,     'Afternoons'],
    [/Nine To Noon/i,  'Nine To Noon'],
    [/Saturday/i,      'Saturday Morning'],
    [/Country Life/i,  'Country Life'],
    [/This Way Up/i,   'This Way Up'],
    [/Summer Report/i, 'Summer Report'],
    [/Nights/i,        'Nights']
  ]

  puts 'Reading XML'
  # Block form closes the file even if parsing raises.
  doc = File.open(file_name) { |file| Nokogiri::XML(file) }

  # check for errors
  doc.errors.each do |error|
    puts "ERROR on #{file_name}: #{error.to_s.strip}\n"
  end

  recipe_count = 0
  valid_count = 0
  titles = 0

  Recipe.destroy_all

  doc.xpath('//recipes//recipe').each do |recipe_data|
    recipe_count += 1
    # Re-parse the element as its own document so the '//' XPath queries
    # below only see this recipe.
    recipe_doc = Nokogiri::XML(recipe_data.to_s)
    chef_found = false

    recipe_doc.xpath('//body').each do |body_node|
      html = tidy(body_node.content)
      # remove any divs
      html = html.gsub(/<div>/, '').gsub(/<\/div>/, '').gsub(/<div [^>]*>/, '')

      title = ''
      chef_name = ''
      programme = ''
      body = ''

      html.each_line do |line|
        case line
        when /<h2>/
          # Stripped here so the emptiness check, the lookup and the stored
          # title all see the same text.
          title = line.gsub(tag_pattern, '').strip
          titles += 1
        when credit_pattern
          credit = line.gsub(tag_pattern, '')

          # Every credit line re-assigns the programme, so the last credit
          # line in the body decides.
          matched = programme_patterns.find { |pattern, _name| credit =~ pattern }
          programme = matched ? matched.last : 'none'

          # Unlike credit_pattern this match is case-sensitive. The capture
          # takes the rest of the line after "Chef" / "Chef:"; only its
          # first two words are used as the name further down.
          if credit =~ /Chef:?(.*)/
            chef_name = Regexp.last_match(1).delete(',').gsub('&nbsp;', '')
            chef_found = true
          end
        else
          body << line
        end
      end

      # A missing or unparseable <date> leaves the date as nil.
      date_node = recipe_doc.xpath('//date').first
      date = begin
        date_node ? DateTime.parse(date_node.content) : nil
      rescue ArgumentError
        nil
      end

      # last_name is nil when the credit has fewer than two words; such a
      # recipe is reported as missed instead of crashing the whole import.
      first_name, last_name = chef_name.split(' ')

      # these are the valid ones to import
      if chef_found && programme != 'none' && !title.empty? && last_name
        valid_count += 1

        # import to ELF
        # Drop a trailing possessive ("Smith's", "Smiths'") from the surname.
        last_name = last_name.sub(/'s$/, '').sub(/s'$/, '')
        title = CGI.unescapeHTML(title).strip

        recipe_record = Recipe.find_by_title(title)
        if recipe_record
          # NOTE(review): this destroys the chef records themselves, not just
          # their link to this recipe - confirm that is intended.
          recipe_record.chefs.each { |chef_obj| chef_obj.destroy }
        else
          recipe_record = Recipe.new
          recipe_record.content ||= SupportingContent.new(:title => title)
        end

        person = Person.find_or_create_by_first_name_and_last_name(first_name, last_name)
        programme_obj = Programme.find_or_create_by_name(programme)

        recipe_record.content.attributes = {:body => body, :body_updated_at => date, :published_at => date}
        recipe_record.attributes = {:programme => programme_obj}
        recipe_record.broadcast_at = date
        recipe_record.chefs << person
        recipe_record.save! && recipe_record.content.save!
      else
        puts "MISSED!"
        puts "CHEF: #{chef_name}"
        puts "TITLE: #{title}"
        puts "PROG: #{programme}"
        puts "Date: #{date}"
      end
    end
  end

  # data output
  puts "Recipe count: #{recipe_count} "
  puts "Title count: #{titles} "
  puts "Parse success: #{valid_count} "
end
# Cleans an HTML fragment by piping it through the external `tidy` command.
#
# data - the HTML text to clean.
#
# Returns tidy's output, or the original data unchanged when tidy produced
# nothing or the pipe broke while talking to it (the failure is reported on
# $stderr). tidy's own diagnostics go to log/tidy.log via the -f option.
def tidy(data)
  cleaned = nil
  pipe = IO.popen('tidy -f "log/tidy.log" --output-xhtml 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -wrap 0 -utf8', 'w+')
  begin
    pipe.write(data)
    # Signal end of input so tidy can finish and emit its output.
    pipe.close_write
    cleaned = pipe.read
  rescue Errno::EPIPE => e
    # Interpolate the message: String + Exception would raise a TypeError
    # and turn this best-effort fallback into a crash.
    $stderr.puts "Running 'tidy' failed: #{e.message}"
  ensure
    # Always release the pipe, whichever path was taken.
    pipe.close unless pipe.closed?
  end
  return data if cleaned.nil? || cleaned.empty?
  cleaned
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment