Skip to content

Instantly share code, notes, and snippets.

@wezm
Created June 28, 2009 01:18
Show Gist options
  • Save wezm/137179 to your computer and use it in GitHub Desktop.
Save wezm/137179 to your computer and use it in GitHub Desktop.
Script to generate an Atom feed from the VirtualBox News page
#!/usr/bin/ruby
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'uuidtools'
require 'date'
require 'activesupport' # For DateTime.to_time
require 'uri'
# Builds an Atom feed from the VirtualBox news page.
#
# The page is fetched and parsed once, in the constructor; the resulting
# feed XML is available through #to_s.
class FeedBuilder
  # Maximum number of entries written to the feed.
  MAX_ENTRIES = 10
  # Maximum number of words kept in an entry title before it is truncated.
  MAX_TITLE_WORDS = 10
  # Public location of the generated feed (used for the rel="self" link).
  SELF_URL = 'http://home.wezm.net/files/virtualbox.atom'

  # @param url [String] address of the news page to convert
  def initialize(url)
    @url = url
    @doc = fetch_document(url)
    build_feed
  end

  # Parses the date held in a node whose text looks like "Jun 28, 2009.".
  #
  # Only the first three whitespace-separated tokens are used. The month may
  # be abbreviated or spelled out (only its first three letters are looked
  # at); trailing punctuation on the day and year is ignored.
  #
  # @param node [#content] node whose text starts with "<month> <day> <year>"
  # @return [DateTime] midnight on the parsed day
  # @raise [ArgumentError] if the node is nil or the text is not a date
  def parse_date(node)
    raise ArgumentError, 'No date node given' if node.nil?

    month, day, year = node.content.split(' ')
    # Date::ABBR_MONTHNAMES is [nil, "Jan", ..., "Dec"], so the index is
    # already the 1-based month number.
    month_number = month && Date::ABBR_MONTHNAMES.index(month[0, 3].capitalize)
    unless month_number && day && year
      raise ArgumentError, "Unable to parse date from: #{node.content.inspect}"
    end

    # Only the first four characters of the year are kept (drops a trailing '.').
    DateTime.civil(year[0..3].to_i, month_number, day.to_i)
  end

  # Derives an entry title from the text of a news item.
  #
  # The title is the text up to the first ',', '!' or '.' that is followed by
  # whitespace or the end of the text; when there is no such punctuation the
  # whole text is used. Titles longer than MAX_TITLE_WORDS words are cut and
  # suffixed with '...'.
  #
  # @param content [String] plain text of the news item
  # @return [String] the title, with runs of whitespace collapsed
  # @raise [RuntimeError] if the content is nil or blank
  def extract_title(content)
    text = content.to_s.strip
    raise "Unable to extract title from: #{content}" if text.empty?

    # /m lets the headline span a line break; \z lets it end the text.
    headline = text[/\A(.*?)[,!.](?:\s|\z)/m, 1] || text
    words = headline.split(' ')
    if words.size > MAX_TITLE_WORDS
      words = words.first(MAX_TITLE_WORDS)
      words[-1] += '...'
    end
    words.join(' ')
  end

  # Convert relative links to absolute.
  #
  # Relative hrefs are resolved against the page URL, keeping any query and
  # fragment. Anchors without an href, links that already carry a scheme or
  # host (http:, mailto:, //host/...) and hrefs that cannot be parsed are
  # left untouched.
  def resolve_links
    @doc.css('a').each do |a|
      href = a['href']
      next if href.nil? || href.strip.empty?

      begin
        next if absolute_link?(URI.parse(href.strip))
        a['href'] = URI.join(@url, href.strip).to_s
      rescue URI::Error
        # Malformed href: keep the original attribute rather than abort.
        next
      end
    end
  end

  # Builds the Atom document from the parsed page and stores the builder
  # in @builder.
  #
  # @raise [RuntimeError] if the page contains no dated news item
  def build_feed
    resolve_links

    # The first <strong> inside the news list holds the newest date.
    newest = @doc.css('div#searchable ul li strong').first
    raise "No dated news items found at #{@url}" if newest.nil?
    last_updated = parse_date(newest)

    @builder = Nokogiri::XML::Builder.new do |feed|
      feed.feed(:xmlns => "http://www.w3.org/2005/Atom") do
        feed.title "VirtualBox News"
        feed.link(:href => @url)
        feed.link(
          :rel => "self",
          :type => "application/atom+xml",
          :href => SELF_URL
        )
        feed.updated last_updated.to_s
        feed.author {
          feed.name "Sun VirtualBox"
        }
        # NOTE(review): timestamp-based UUIDs are presumably regenerated on
        # every run, so ids may not be stable between runs - confirm against
        # the uuidtools documentation.
        feed.id_ 'urn:uuid:' + UUID.timestamp_create(last_updated.to_time).to_s

        # Add the entries, limited to MAX_ENTRIES dated items.
        emitted = 0
        @doc.css('div#searchable ul li').each do |item|
          break if emitted >= MAX_ENTRIES

          date_node = item.css('strong').first
          next if date_node.nil? # list item without a date: not an entry

          emitted += 1
          feed.entry {
            # Detach the date so it is not repeated in the title and content.
            updated = parse_date(date_node.remove)
            content = item.inner_html.lstrip
            feed.title extract_title(item.content)
            feed.link(:rel => "alternate", :type => "text/html", :href => @url)
            feed.id_ 'urn:uuid:' + UUID.timestamp_create(updated.to_time).to_s
            feed.updated updated.to_s
            feed.content(content, :type => "html")
          }
        end
      end
    end
  end

  # @return [String] the feed serialised as XML
  def to_s
    @builder.to_xml
  end

  private

  # Downloads and parses the page, closing the connection afterwards.
  # URI.open is used when open-uri provides it; otherwise this falls back to
  # the Kernel#open form that older open-uri versions patch.
  def fetch_document(url)
    if URI.respond_to?(:open)
      URI.open(url) { |io| Nokogiri::HTML(io) }
    else
      open(url) { |io| Nokogiri::HTML(io) }
    end
  end

  # True when the parsed link already names a scheme or a host.
  def absolute_link?(uri)
    !(uri.scheme.nil? && uri.host.nil?)
  end
end
# Command-line entry point: expects the output file path as the only
# argument, builds the feed from the VirtualBox news page and writes it there.
if ARGV.empty?
  puts "Usage: virtualbox_feed output.atom"
  exit 2
end

output_path = ARGV[0]
# The feed is built before the file is opened for writing.
news_feed = FeedBuilder.new('http://www.virtualbox.org/wiki/News')
File.open(output_path, 'w') { |file| file << news_feed.to_s }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment