Created
June 28, 2009 01:18
-
-
Save wezm/137179 to your computer and use it in GitHub Desktop.
Script to generate an Atom feed from the VirtualBox News page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
require 'rubygems' | |
require 'nokogiri' | |
require 'open-uri' | |
require 'uuidtools' | |
require 'date' | |
require 'activesupport' # For DateTime.to_time | |
require 'uri' | |
# Create an Atom feed from the VirtualBox news page.
#
# The page is downloaded and parsed when the object is constructed; the
# finished feed is available as XML through #to_s.
class FeedBuilder
  # Three-letter English month abbreviations, in calendar order.
  MONTHS = %w(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec).freeze

  # Upper bound on the number of <entry> elements written to the feed.
  MAX_ENTRIES = 10

  # Matches "Jun 26, 2009." as well as "June 26 2009": a month name (only
  # its first three letters are captured), a one or two digit day and a
  # four digit year. Anything after the year is ignored.
  DATE_PATTERN = /([A-Za-z]{3})[A-Za-z]*\.?\s+(\d{1,2})\D*?\s+(\d{4})/

  # url - address of the news page; also used as the feed's main link and
  #       as the base when relative links are made absolute.
  def initialize(url)
    @url = url
    @doc = fetch_document(url)
    build_feed
  end

  # Turn the text of a date node (e.g. "Jun 26, 2009.") into a DateTime at
  # midnight on that day.
  #
  # node - any object responding to #content.
  #
  # Raises ArgumentError when node is nil or its text holds no
  # recognisable date.
  def parse_date(node)
    raise ArgumentError, 'No date element to parse' if node.nil?

    text = node.content.to_s
    match = DATE_PATTERN.match(text)
    raise ArgumentError, "Unable to parse date from: #{text.inspect}" if match.nil?

    month = MONTHS.index(match[1].capitalize)
    raise ArgumentError, "Unknown month in: #{text.inspect}" if month.nil?

    DateTime.civil(match[3].to_i, month + 1, match[2].to_i)
  end

  # Derive an entry title from the entry text: everything up to the first
  # ',', '!' or '.' that is followed by whitespace or the end of a line,
  # cut down to ten words (with '...' appended when words were dropped).
  # Text without such punctuation is used as a whole.
  #
  # Raises RuntimeError when content holds no text at all.
  def extract_title(content)
    text = content.to_s.strip
    raise "Unable to extract title from: #{content}" if text.empty?

    # A '.' inside a version number ("3.0.2") is followed by a digit and is
    # therefore not treated as the end of the sentence.
    match = /^(.*?)[,!\.](?:\s|$)/.match(text)
    words = match ? match[1].strip.split(' ') : []
    words = text.split(' ') if words.empty?

    if words.size > MAX_ENTRIES
      words = words[0, 10] # First 10 words
      words[-1] = words[-1] + '...'
    end
    words.join(' ')
  end

  # Convert relative links to absolute ones, resolved against the page URL
  # so that query strings and fragments survive. Anchors without an href,
  # links that already carry a scheme (http:, mailto:, ...) and hrefs that
  # cannot be parsed as a URI are left untouched.
  def resolve_links
    @doc.css('a').each do |a|
      absolute = absolute_href(a['href'])
      a['href'] = absolute unless absolute.nil?
    end
  end

  # Build the Atom document from the parsed page and keep the builder for
  # #to_s. The feed's updated time is the date of the first news item.
  def build_feed
    resolve_links
    @builder = Nokogiri::XML::Builder.new do |feed|
      feed.feed(:xmlns => "http://www.w3.org/2005/Atom") do
        last_updated = parse_date(@doc.css('div#searchable ul li strong').first)
        feed.title "VirtualBox News"
        feed.link(:href => @url)
        feed.link(
          :rel => "self",
          :type => "application/atom+xml",
          :href => 'http://home.wezm.net/files/virtualbox.atom'
        )
        feed.updated last_updated.to_s
        feed.author {
          feed.name "Sun VirtualBox"
        }
        # NOTE(review): a timestamp UUID presumably differs between runs for
        # the same date, which would make the ids unstable - confirm against
        # the uuidtools documentation.
        feed.id_ 'urn:uuid:' + UUID.timestamp_create(last_updated.to_time).to_s

        # Add the entries, limit to MAX_ENTRIES items. List items without a
        # <strong> date (e.g. nested bullet points) are not news items and
        # are skipped instead of aborting the whole feed.
        written = 0
        @doc.css('div#searchable ul li').each do |item|
          break if written >= MAX_ENTRIES
          date_node = item.css('strong').first
          next if date_node.nil?
          add_entry(feed, item, date_node)
          written += 1
        end
      end
    end
  end

  # The feed serialised as an XML string.
  def to_s
    @builder.to_xml
  end

  private

  # Download url and parse it as HTML. The block form closes the
  # connection / temp file once parsing is done.
  def fetch_document(url)
    if URI.respond_to?(:open)
      # open-uri on Ruby >= 2.5; Kernel#open no longer accepts URLs on
      # Ruby 3.
      URI.open(url) { |io| Nokogiri::HTML(io) }
    else
      open(url) { |io| Nokogiri::HTML(io) }
    end
  end

  # Absolute form of href relative to @url, or nil when the link should be
  # left as it is (missing, blank, already absolute or unparseable).
  def absolute_href(href)
    return nil if href.nil?
    target = href.strip
    return nil if target.empty?
    return nil unless URI.parse(target).relative?
    URI.join(@url, target).to_s
  rescue URI::Error
    nil
  end

  # Append one <entry> to feed for the news item.
  #
  # feed      - the Nokogiri XML builder being written to.
  # item      - the <li> node of the news item.
  # date_node - the <strong> node inside item that holds its date; it is
  #             detached so that it does not show up in title or content.
  def add_entry(feed, item, date_node)
    updated = parse_date(date_node)
    date_node.remove
    content = item.inner_html.lstrip
    title = extract_title(item.content)
    feed.entry {
      feed.title title
      feed.link(:rel => "alternate", :type => "text/html", :href => @url)
      feed.id_ 'urn:uuid:' + UUID.timestamp_create(updated.to_time).to_s
      feed.updated updated.to_s
      feed.content(content, :type => "html")
    }
  end
end
# Command-line entry point: the only argument is the path of the Atom file
# to write. Without it, print the usage line and exit with status 2.
if ARGV.empty?
  puts "Usage: virtualbox_feed output.atom"
  exit 2
end

# Build the feed first so that the output file is only opened (and
# truncated) once the page has been fetched and converted.
news_feed = FeedBuilder.new('http://www.virtualbox.org/wiki/News')
File.open(ARGV[0], 'w') { |out| out << news_feed.to_s }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment