Skip to content

Instantly share code, notes, and snippets.

@metade
Created October 8, 2008 09:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save metade/15486 to your computer and use it in GitHub Desktop.
Save metade/15486 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'open-uri'
require 'hpricot'
require 'json'
require 'rena'
require 'rdelicious'
# Fetches a JSON schedule document and returns the programme pids of its
# broadcasts, ordered by each broadcast's 'start' value.
#
# url - String location of the schedule JSON. It is opened with URI.open,
#       because on Ruby 3.0+ a bare Kernel#open no longer goes through
#       open-uri and would treat the URL as a local file name.
#
# Returns an Array of the 'pid' values found under each broadcast's
# 'programme' entry.
def get_pids_in_schedule(url)
  # Block form so the underlying IO is closed as soon as the body is read.
  schedule = JSON.parse(URI.open(url, &:read))
  broadcasts = schedule['broadcasts']
  broadcasts.sort_by { |broadcast| broadcast['start'] }
            .map { |broadcast| broadcast['programme']['pid'] }
end
# Downloads the RDF document for a programme and returns its longest synopsis.
#
# pid - programme identifier, interpolated into the /programmes/<pid>.rdf URL.
#
# Every triple whose predicate is one of the Programmes Ontology
# short/medium/long synopsis properties contributes its object's contents
# (presumably a String literal - confirm against Rena) as a candidate.
#
# Returns the candidate with the greatest length, or nil when the document
# has no synopsis triples.
def get_description(pid)
  uri = "http://www.bbc.co.uk/programmes/#{pid}.rdf"
  # URI.open rather than Kernel#open: on Ruby 3.0+ the latter no longer
  # handles URLs. The block form closes the handle once the body is read.
  doc = Rena::RdfXmlParser.new(URI.open(uri, &:read), uri)

  synopsis_predicates = %w[short_synopsis medium_synopsis long_synopsis].map do |name|
    Rena::URIRef.new("http://purl.org/ontology/po/#{name}")
  end

  descriptions = []
  doc.graph.triples.each do |triple|
    # Keep the triple's predicate on the left of ==, as the comparison is
    # defined by Rena and may not be symmetric.
    next unless synopsis_predicates.any? { |predicate| triple.predicate == predicate }

    descriptions << triple.object.contents
  end

  descriptions.max_by(&:length)
end
# Sends a description to the metadata "terms" service and returns the
# resources it links to.
#
# description - text to analyse; nil or empty yields [] without a request.
#
# Returns an Array holding the 'rdf:resource' attribute of every owl:sameAs
# element in the response, or [] when the lookup fails for any reason
# (the lookup is deliberately best-effort; failures are reported on stderr).
def get_concepts(description)
  return [] if description.nil? || description.empty?

  url = "http://bbcapps2153.national.core.bbc.co.uk/metadataservices/terms?cis=on&text="
  # URI.escape was removed in Ruby 3.0 and never escaped '&', '=' or '#',
  # so such characters in the text corrupted the query string. Form-encode
  # instead, then spell spaces as %20 so the request does not depend on the
  # server decoding '+'.
  encoded = URI.encode_www_form_component(description).gsub('+', '%20')
  begin
    doc = Hpricot.XML(URI.open("#{url}#{encoded}"))
  rescue StandardError => e
    warn "get_concepts: lookup failed (#{e.class}: #{e.message})"
    return []
  end
  (doc / '//owl:sameAs').map { |concept| concept.attributes['rdf:resource'] }
end
# Maps concept URIs to the gids recorded for them in $links_to_gids.
#
# concepts - Array of URI strings. A leading DBpedia resource prefix is
#            swapped for the English Wikipedia prefix before the lookup.
#
# Returns a flat Array of gids; concepts with no entry contribute nothing.
def gids_for_concepts(concepts)
  lookups = concepts.each_with_object([]) do |concept, found|
    wikipedia_url = concept.sub('http://dbpedia.org/resource/', 'http://en.wikipedia.org/wiki/')
    found << $links_to_gids[wikipedia_url]
  end
  lookups.flatten.compact
end
# Builds the url => [gid, ...] lookup table from a tab-separated file.
#
# path - file to read; defaults to 'artists_urls.txt' in the current
#        working directory. Each useful line has the form "gid<TAB>url";
#        any further tab-separated fields are ignored.
#
# Blank lines and lines missing either field are skipped, so the table
# never gains a nil key or nil/empty gids.
#
# Returns a plain Hash (no default value) mapping each url String to the
# Array of gids listed for it, in file order.
def load_links_to_gids(path = 'artists_urls.txt')
  links_to_gids = {}
  # File.foreach streams the file line by line instead of loading it whole.
  File.foreach(path) do |line|
    gid, url = line.chomp.split("\t")
    next if gid.nil? || gid.empty? || url.nil? || url.empty?

    (links_to_gids[url] ||= []) << gid
  end
  links_to_gids
end
# Reopens Rdelicious so its connection to the del.icio.us API goes through
# the BBC HTTP proxy.
class Rdelicious
  # Stores the account credentials and prepares an SSL-enabled connection
  # to api.del.icio.us:443 via the proxy at www-cache.reith.bbc.co.uk:80.
  #
  # id       - del.icio.us account name.
  # password - del.icio.us account password.
  def initialize(id, password)
    @delicious_id = id
    @delicious_password = password

    proxy_host = 'www-cache.reith.bbc.co.uk'
    proxy_port = 80
    @http = Net::HTTP.new("api.del.icio.us", 443, proxy_host, proxy_port).tap do |connection|
      connection.use_ssl = true
    end
  end
end
# Driver: for every programme in the music genre schedule, look up the
# artists mentioned in its description and bookmark the programme page on
# del.icio.us, tagged with the matching gids.

# Shared state read by gids_for_concepts and by the loop below.
$links_to_gids = load_links_to_gids
$delicious = Rdelicious.new('bbcmusicprogrammes', 'password')

schedule_url = 'http://www.bbc.co.uk/programmes/genres/music/schedules.json'

get_pids_in_schedule(schedule_url).each do |pid|
  description = get_description(pid)
  concepts = get_concepts(description)
  gids = gids_for_concepts(concepts)
  p [pid, concepts, gids]

  if gids.size > 0
    programme_url = "http://www.bbc.co.uk/programmes/#{pid}"
    tag_list = gids.map { |gid| "musicbrainz/artist/#{gid}" }.join(" ")
    # Only bookmark pages that have not been saved already.
    already_saved = $delicious.url_exists?(programme_url)
    $delicious.add(programme_url, pid, description[0, 200], tag_list) unless already_saved
  end

  # Pause between programmes, whether or not a bookmark was added.
  sleep 1
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment