metade/delicious_music_news.rb

## delicious_music_news.rb
require 'rubygems'
require 'open-uri'
require 'hpricot'
require 'addressable/uri'
require 'feed-normalizer'
require 'rdelicious'

# Scans news feeds for stories that link to known artist URLs and bookmarks
# those stories through Rdelicious, tagged "musicbrainz/artist/<gid>".
#
# The URL -> gid lookup table is loaded once, at construction time, from a
# tab-separated file holding one "gid<TAB>url" pair per line.
class DeliciousMusicNews
  # @param username [String] account name handed to Rdelicious.new
  # @param password [String] account password handed to Rdelicious.new
  def initialize(username, password)
    @links_to_gids = links_to_gids
    @delicious = Rdelicious.new(username, password)
  end

  # Parses the feed at +feed_url+, fetches the story page of every entry and
  # bookmarks each entry whose story links to a URL found in the lookup table.
  #
  # @param feed_url [String] URL of the feed to process
  # @return [Array<String>] the gids that matched at least one entry
  def process_feed(feed_url)
    results = Hash.new { |hash, gid| hash[gid] = [] }
    # URI.open instead of Kernel#open: Kernel#open runs a command when the
    # string starts with "|" and no longer opens URLs on Ruby 3.0+.
    feed = FeedNormalizer::FeedNormalizer.parse(URI.open(feed_url))
    feed.entries.each do |entry|
      external_links_from_news_story(entry.url).each do |uri|
        gids_for_uri(uri).each { |gid| results[gid] << entry }
      end
    end

    # NOTE(review): if url_exists? already reports a bookmark added earlier in
    # this loop, a story that links to several artists only receives the tag
    # of the first gid processed -- confirm against the Rdelicious API.
    results.each do |gid, entries|
      entries.each do |entry|
        next if @delicious.url_exists?(entry.url)

        @delicious.add(entry.url, entry.title, entry.description, "musicbrainz/artist/#{gid}")
      end
    end
    results.keys
  end

  private

  # Looks +uri+ up in the table both with and without a trailing slash.
  #
  # @param uri [#to_s] link taken from a story page
  # @return [Array<String>] unique gids registered for either spelling
  def gids_for_uri(uri)
    bare = uri.to_s.chomp('/')
    [bare, "#{bare}/"].flat_map { |candidate| @links_to_gids[candidate] }.compact.uniq
  end

  # Collects the absolute links of the page at +url+ that pass external_link?.
  #
  # @param url [String] URL of the story page
  # @return [Array<Addressable::URI>] links in document order
  def external_links_from_news_story(url)
    doc = Hpricot(URI.open(url))
    doc.search('//a').each_with_object([]) do |link, links|
      uri = Addressable::URI.parse(link.attributes['href'])
      next if uri.nil? || uri.relative?

      links << uri if external_link?(uri)
    end
  end

  # Decides whether a link is worth looking up in the gid table.
  # del.icio.us links are dropped; BBC links are dropped too, except BBC
  # music artist pages (with or without the "www." prefix), which are kept.
  #
  # @param uri [#to_s] absolute link
  # @return [Boolean]
  def external_link?(uri)
    url = uri.to_s
    return false if url.match?(%r{http://(del\.icio\.us)})
    return true unless url.match?(%r{http://(news.*?\.|www\.)?bbc\.co\.uk})

    # Anchored: the artist page has to be the link itself, not a substring.
    url.match?(%r{\Ahttp://(www\.)?bbc\.co\.uk/music/artist/\w+})
  end

  # Builds the URL -> gids table from a tab-separated "gid<TAB>url" file.
  # Blank lines and lines missing either field are skipped so the table never
  # holds nil/empty keys or gids.
  #
  # @param path [String] file to read; defaults to artists_urls.txt in the
  #   current working directory
  # @return [Hash{String => Array<String>}]
  def links_to_gids(path = 'artists_urls.txt')
    File.foreach(path).each_with_object({}) do |line, table|
      gid, url = line.chomp.split("\t")
      next if gid.nil? || gid.empty? || url.nil? || url.empty?

      (table[url] ||= []) << gid
    end
  end
end

# Driver: bookmark matching stories from each BBC feed, in the order listed.
dmn = DeliciousMusicNews.new('bbcmusicnews', '********')

feed_urls = %w[
  http://newsrss.bbc.co.uk/rss/newsbeat/newsbeat/rss.xml
  http://newsrss.bbc.co.uk/rss/newsbeat/music/rss.xml
  http://newsrss.bbc.co.uk/rss/newsbeat/entertainment/rss.xml
  http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/entertainment/rss.xml
]

feed_urls.each do |feed_url|
  dmn.process_feed(feed_url)
end
	require 'rubygems'
	require 'open-uri'
	require 'hpricot'
	require 'addressable/uri'
	require 'feed-normalizer'
	require 'rdelicious'

	# Scans news feeds for stories that link to known artist URLs and bookmarks
	# those stories through Rdelicious, tagged "musicbrainz/artist/<gid>".
	#
	# The URL -> gid lookup table is loaded once, at construction time, from a
	# tab-separated file holding one "gid<TAB>url" pair per line.
	class DeliciousMusicNews
	  # @param username [String] account name handed to Rdelicious.new
	  # @param password [String] account password handed to Rdelicious.new
	  def initialize(username, password)
	    @links_to_gids = links_to_gids
	    @delicious = Rdelicious.new(username, password)
	  end

	  # Parses the feed at +feed_url+, fetches the story page of every entry and
	  # bookmarks each entry whose story links to a URL found in the lookup table.
	  #
	  # @param feed_url [String] URL of the feed to process
	  # @return [Array<String>] the gids that matched at least one entry
	  def process_feed(feed_url)
	    results = Hash.new { |hash, gid| hash[gid] = [] }
	    # URI.open instead of Kernel#open: Kernel#open runs a command when the
	    # string starts with "|" and no longer opens URLs on Ruby 3.0+.
	    feed = FeedNormalizer::FeedNormalizer.parse(URI.open(feed_url))
	    feed.entries.each do |entry|
	      external_links_from_news_story(entry.url).each do |uri|
	        gids_for_uri(uri).each { |gid| results[gid] << entry }
	      end
	    end

	    # NOTE(review): if url_exists? already reports a bookmark added earlier in
	    # this loop, a story that links to several artists only receives the tag
	    # of the first gid processed -- confirm against the Rdelicious API.
	    results.each do |gid, entries|
	      entries.each do |entry|
	        next if @delicious.url_exists?(entry.url)

	        @delicious.add(entry.url, entry.title, entry.description, "musicbrainz/artist/#{gid}")
	      end
	    end
	    results.keys
	  end

	  private

	  # Looks +uri+ up in the table both with and without a trailing slash.
	  #
	  # @param uri [#to_s] link taken from a story page
	  # @return [Array<String>] unique gids registered for either spelling
	  def gids_for_uri(uri)
	    bare = uri.to_s.chomp('/')
	    [bare, "#{bare}/"].flat_map { |candidate| @links_to_gids[candidate] }.compact.uniq
	  end

	  # Collects the absolute links of the page at +url+ that pass external_link?.
	  #
	  # @param url [String] URL of the story page
	  # @return [Array<Addressable::URI>] links in document order
	  def external_links_from_news_story(url)
	    doc = Hpricot(URI.open(url))
	    doc.search('//a').each_with_object([]) do |link, links|
	      uri = Addressable::URI.parse(link.attributes['href'])
	      next if uri.nil? || uri.relative?

	      links << uri if external_link?(uri)
	    end
	  end

	  # Decides whether a link is worth looking up in the gid table.
	  # del.icio.us links are dropped; BBC links are dropped too, except BBC
	  # music artist pages (with or without the "www." prefix), which are kept.
	  #
	  # @param uri [#to_s] absolute link
	  # @return [Boolean]
	  def external_link?(uri)
	    url = uri.to_s
	    return false if url.match?(%r{http://(del\.icio\.us)})
	    return true unless url.match?(%r{http://(news.*?\.|www\.)?bbc\.co\.uk})

	    # Anchored: the artist page has to be the link itself, not a substring.
	    url.match?(%r{\Ahttp://(www\.)?bbc\.co\.uk/music/artist/\w+})
	  end

	  # Builds the URL -> gids table from a tab-separated "gid<TAB>url" file.
	  # Blank lines and lines missing either field are skipped so the table never
	  # holds nil/empty keys or gids.
	  #
	  # @param path [String] file to read; defaults to artists_urls.txt in the
	  #   current working directory
	  # @return [Hash{String => Array<String>}]
	  def links_to_gids(path = 'artists_urls.txt')
	    File.foreach(path).each_with_object({}) do |line, table|
	      gid, url = line.chomp.split("\t")
	      next if gid.nil? || gid.empty? || url.nil? || url.empty?

	      (table[url] ||= []) << gid
	    end
	  end
	end

	# Driver: bookmark matching stories from each BBC feed, in the order listed.
	dmn = DeliciousMusicNews.new('bbcmusicnews', '********')
	[
	  'http://newsrss.bbc.co.uk/rss/newsbeat/newsbeat/rss.xml',
	  'http://newsrss.bbc.co.uk/rss/newsbeat/music/rss.xml',
	  'http://newsrss.bbc.co.uk/rss/newsbeat/entertainment/rss.xml',
	  'http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/entertainment/rss.xml'
	].each { |feed| dmn.process_feed(feed) }