Skip to content

Instantly share code, notes, and snippets.

@mro
Created March 27, 2010 12:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mro/345983 to your computer and use it in GitHub Desktop.
Save mro/345983 to your computer and use it in GitHub Desktop.
#
# Bayern2 Programmseite abgrasen
#
require 'time'
# sudo gem install scrapi
#
# http://exceptionz.wordpress.com/2009/11/03/scrapi-on-snow-leopard/
require 'scrapi'
require 'cgi'
require 'sqlite3'
#
# Scrape helpers shared by the Bayern2 programme scraper.
#
class Tools
  # Options for scrapi's Scraper#scrape: a browser user agent plus tidy
  # parser options for the latin1-encoded BR pages.
  def self.scrape_options
    return {:user_agent=>'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7; de-de) AppleWebKit/530.19.2 (KHTML, like Gecko) Version/4.0.2 Safari/530.19',
      :parser_options => {
        "input-encoding" => :latin1,
        "output-encoding" => :utf8,
        # NOTE(review): :false/:true are truthy symbols, not booleans -
        # confirm that scrapi/tidy expects option values in this form.
        'preserve-entities' => :false,
        "quote-marks" => :true,
        'output-xhtml' => :true
      }
    }
  end

  # Decode HTML entities in +html+ to UTF-8 text; returns nil for nil.
  # CGI.unescapeHTML only handles the basic entities, so the named
  # typographic entities used by the BR pages are replaced explicitly.
  # (In the published gist these replacements appear as no-op gsubs
  # because the entity strings were rendered by the page; the entity
  # source strings are restored here.)
  def self.unescapeHTML html
    return html if html.nil?
    html = CGI.unescapeHTML html
    html.gsub! '&nbsp;', ' '
    html.gsub! '&apos;', '\''
    html.gsub! '&ndash;', '–'
    html.gsub! '&lsquo;', '‘'
    html.gsub! '&rsquo;', '’'
    html.gsub! '&ldquo;', '“'
    html.gsub! '&rdquo;', '”'
    html.gsub! '&bdquo;', '„'
    html.gsub! '&hellip;', '…'
    html.gsub! '&euro;', '€'
    html.gsub! '&Delta;', 'Δ'
    html
  end
end
#
# Scraper and sqlite cache for the Bayern2 radio programme pages
# (www.br-online.de).
#
class Bayern2
  # Normalize one raw scrapi row into a broadcast hash with :dtstart,
  # :href, :title and :description. 'now' anchors the calendar date of
  # the HH:MM start time found on the page.
  def self.clean raw, now
    ret = {}
    m = /([0-9]{2}:[0-9]{2}).*Uhr/.match( Tools.unescapeHTML( raw.start ) )
    if m.nil?
      # was '$stderr.put' - a NoMethodError whenever a row did not match
      $stderr.puts "mismatch: #{raw.start}"
    else
      base_date = now
      # before 05:00 the page still shows the previous broadcast day, so
      # such times belong to the following calendar day
      base_date = base_date + 24*60*60 if m[1] < '05:00'
      ret[:dtstart] = Time.parse "#{base_date.strftime '%Y-%m-%dT'}#{m[1]}:00"
    end
    # strip the session id so hrefs stay stable between scrapes
    ret[:href] = "http://www.br-online.de#{raw.href.gsub(/;jsessionid=[A-Z0-9]+/,'')}" if raw.href
    ret[:title] = Tools.unescapeHTML raw.title
    # <br/> becomes a newline, <p>/</p> tags are dropped
    ret[:description] = Tools.unescapeHTML(raw.description.to_s).gsub("\n"," ").gsub(/ *<br *\/?> */,"\n").gsub(/ *<\/?p> */,'')
    ret[:description] = nil if ret[:description] == ''
    ret
  end

  # Live stream URI of Bayern2.
  def self.stream_uri
    URI.parse 'http://gffstream.ic.llnwd.net/stream/gffstream_w11a'
  end

  # Programme page URI for the broadcast day containing 'date'.
  def self.programm_uri_for_date date
    # before 05:00 the programme web page of the previous day applies
    date = date - 24*60*60 if date.hour < 5
    URI.parse "http://www.br-online.de/br/jsp/global/funktion/programmvorschau/programmfahne.jsp?programm=B2&datum=#{date.strftime '%d.%m.%Y'}"
  end

  # Scrape the full programme for the broadcast day of 'now'. Returns an
  # array of broadcast hashes ordered by :dtstart; each entry's :dtend
  # is the next entry's :dtstart (the day's last entry has no :dtend).
  def self.programm now=Time.now
    # TODO limit access: 2009-01-31 < t < Time.now + 24h
    b2_sendung = Scraper.define do
      attr_accessor :start, :title, :href, :description
      process "th", :start => :text
      process "td > h4", :title => :text
      process "td > h4 > a", :href => '@href'
      process "td > p", :description => :element
      result :start, :title, :href, :description
    end
    b2_programm = Scraper.define do
      attr_accessor :sendungen
      array :sendungen
      process "html body div#Inhalt tbody tr", :sendungen => b2_sendung
      result :sendungen
    end
    ret = []
    prev = nil
    b2_programm.scrape(Bayern2.programm_uri_for_date(now), Tools.scrape_options).each do |raw|
      current = Bayern2.clean(raw, now)
      if prev
        # sanity check: entries must be strictly ascending
        raise "dtend >= dtstart: #{prev[:dtstart]} >= #{current[:dtstart]}: #{current[:title]}" if prev[:dtstart] >= current[:dtstart]
        prev[:dtend] = current[:dtstart]
      end
      ret << (prev = current)
    end
    ret
  end

  # Broadcast running at 'now', or nil. Entries without :dtend (the last
  # one of the day) are skipped instead of raising on a Time-vs-nil
  # comparison as the original 'now < s[:dtend]' did.
  def self.sendung now=Time.now
    Bayern2.programm(now).find do |s|
      !s[:dtend].nil? && s[:dtstart] <= now && now < s[:dtend]
    end
  end

  # Open (and lazily create) the sqlite cache next to this file.
  def self.open_db
    db = SQLite3::Database.new( "#{File.expand_path(File.dirname(__FILE__))}/bayern2.sqlite" )
    db.execute <<-SQL
      CREATE TABLE IF NOT EXISTS programm (
        dtstart VARCHAR(28),
        dtend VARCHAR(28),
        title TEXT,
        description TEXT,
        href TEXT
      )
    SQL
    db.execute 'CREATE UNIQUE INDEX IF NOT EXISTS programm_idx ON programm ( dtstart, dtend )'
    db
  end

  # Rebuild a broadcast hash from a
  # 'SELECT rowid,dtstart,dtend,title,description,href' row;
  # nil columns are left out of the hash.
  def self.refill_into_hash rr
    ret = { :rowid => rr[0].to_i }
    ret[:dtstart] = Time.parse(rr[1]) if ! rr[1].nil?
    ret[:dtend] = Time.parse(rr[2]) if ! rr[2].nil?
    ret[:title] = rr[3] if ! rr[3].nil?
    ret[:description] = rr[4] if ! rr[4].nil?
    ret[:href] = rr[5] if ! rr[5].nil?
    ret
  end

  # ISO 8601 string for t; nil stays nil.
  def self.iso t
    return nil if t.nil?
    t.strftime '%Y-%m-%dT%H:%M:%S%z'
  end

  # public: scrape the programme covering 'now' into the db - but only
  # if the db does not already contain a broadcast for that instant.
  def self.scrape now=Time.now
    db = Bayern2.open_db
    # look up if we have to scrape at all?
    sql_select = "SELECT rowid FROM programm WHERE dtstart <= ? AND ? < dtend"
    r = db.execute( sql_select, Bayern2.iso(now), Bayern2.iso(now))
    if r.length == 0
      $stderr.puts "scraping #{now}..."
      begin
        db.transaction do |txn|
          Bayern2.programm(now).each do |s|
            txn.execute( "INSERT INTO programm (dtstart,dtend,title,description,href) VALUES (?,?,?,?,?)", iso(s[:dtstart]), iso(s[:dtend]), s[:title], s[:description], s[:href])
          end
          sleep 0.5
        end
      rescue SQLite3::SQLException => e
        # e.g. unique-index violation when another process inserted first
        $stderr.puts "#{e}"
      end
    end
    db.close
  end

  # public: load broadcasts by rowid (currently exactly one id).
  def self.findBroadcastByRowIds argv
    return [] if argv.nil? || argv.length < 1
    raise "Currently only one id allowed." if argv.length > 1
    db = Bayern2.open_db
    sql_select = "SELECT rowid,dtstart,dtend,title,description,href FROM programm WHERE rowid = ?"
    r = db.execute( sql_select, argv[0] )
    ret = []
    r.each {|rr| ret << Bayern2.refill_into_hash(rr)}
    db.close
    ret
  end

  # public: next broadcast (running or upcoming at 'now') whose title
  # matches the SQL LIKE pattern, or nil.
  def self.findNextBroadcastByTitleLike title, now = Time.now
    db = Bayern2.open_db
    sql_select = "SELECT rowid,dtstart,dtend,title,description,href FROM programm WHERE title like ? AND dtend > ? ORDER BY dtstart ASC LIMIT 1"
    r = db.execute( sql_select, title, Bayern2.iso(now) )
    ret = []
    r.each {|rr| ret << Bayern2.refill_into_hash(rr)}
    db.close
    ret[0].nil? ? nil : ret[0]
  end

  # public: broadcasts that started at/before 'start' and end after
  # 'stop', i.e. those covering the whole interval.
  def self.findBroadcastsInTimeInterval start = Time.now, stop = start
    db = Bayern2.open_db
    sql_select = "SELECT rowid,dtstart,dtend,title,description,href FROM programm WHERE dtstart <= ? AND ? < dtend ORDER BY dtstart ASC"
    r = db.execute( sql_select, Bayern2.iso(start), Bayern2.iso(stop) )
    ret = []
    r.each {|rr| ret << Bayern2.refill_into_hash(rr)}
    db.close
    ret
  end
end
#!/usr/bin/ruby -rubygems
require "#{File.dirname __FILE__}/../programm.rb"
require 'rexml/document'
# refactor to remove
# Collects broadcast hashes as <track> elements under a single <stream>
# root and writes the document on #flush. ("refactor to remove" per the
# original note.)
# The original also assigned class-level instance variables
# (@xml/@root/@dst at class-body scope) which were never read - removed.
class XmlWriter
  # Append one <track> element for 'hash'; each key becomes a child
  # element, Time values are formatted as ISO 8601. Entries without a
  # title are skipped. 'dst' is remembered for #flush.
  def write hash, dst=$stdout
    return if hash[:title].nil? || hash[:title] == ''
    @xml = REXML::Document.new if @xml.nil?
    @root = REXML::Element.new 'stream', @xml if @root.nil?
    @dst = dst
    track = REXML::Element.new 'track', @root
    hash.each do |k,v|
      elem = REXML::Element.new k.to_s, track
      txt = v.to_s
      txt = v.strftime '%Y-%m-%dT%H:%M:%S%z' if v.kind_of? Time
      elem.text = txt
    end
  end

  # The document built so far (nil before the first successful write).
  def xml
    @xml
  end

  # Write the accumulated document to the last destination, if any.
  def flush
    return if @dst.nil?
    @dst.puts @xml if ! @xml.nil?
    @dst.flush
  end
end
# Serialize recordings as <track> xml to 'dst'. 'recordings' may be a
# Time (all broadcasts covering that instant are looked up), one
# broadcast hash, or an array of broadcast hashes.
def write_recording_xml dst=$stdout, recordings=Time.now
  # Bayern2.scrape now
  writer = XmlWriter.new
  recordings = Bayern2.findBroadcastsInTimeInterval(recordings) if recordings.kind_of? Time
  recordings = [ recordings ] if recordings.kind_of? Hash
  recordings.each do |bc|
    track = {}
    track[:start] = bc[:dtstart]
    track[:stop] = bc[:dtend]
    track[:stream_url] = Bayern2.stream_uri
    track[:program_url] = Bayern2.programm_uri_for_date(bc[:dtstart])
    track[:album] = 'B2 Zündfunk'
    track[:artist] = 'B2 Zündfunk'
    track[:title] = "#{bc[:title]}: #{bc[:description]}"
    writer.write track, dst
  end
  writer.flush
end
# Larger of the two values (ties resolve to b, as in `a > b ? a : b`).
def max a, b
  if a > b
    a
  else
    b
  end
end
# Smaller of the two values (ties resolve to a, as in `a > b ? b : a`).
def min a, b
  b >= a ? a : b
end
# http://code.google.com/p/xstreamripper/source/browse/trunk/streamripper/fetch_external_metadata.pl
# Mode 1: invoked by streamripper (via its -E option, see the cmd built
# below) with a broadcast rowid. Periodically prints TITLE/ARTIST/ALBUM
# metadata blocks terminated by a '.' line to stdout while the broadcast
# is running, so streamripper can split the stream into tracks.
if ARGV[0] == '-fetch_external_metadata_for_broadcast_rowids'
# drop the mode flag, leaving only the rowid argument(s)
ARGV[0] = nil
ARGV.compact!
broadcasts = Bayern2.findBroadcastByRowIds ARGV
if broadcasts.length == 0
$stderr.puts "I need rowids!"
exit 1
end
if broadcasts.length > 1
$stderr.puts "Sorry, but currently I only support one rowid."
exit 2
end
bc = broadcasts[0]
# endless loop - streamripper kills this process when ripping ends
while true
dt = -Time.now.to_f + bc[:dtstart].to_f - 0 # wait until n sec before broadcast start
if dt > 0
# don't write a prefix!
# streamripper seems to write this once anyway, so if we want to cut off a header
# we better don't switch the meta-data before we actually want to start recording...
# puts '.'
else
dt = -Time.now.to_f + bc[:dtend].to_f + 5 # record until n sec after broadcast end
if dt > 0
# broadcast is on air: label the stream 'recording'
puts "TITLE=recording\nARTIST=Bayern2\nALBUM=Zuendfunk\n."
else
# past dtend: everything from now on is suffix; idle (sleep capped below)
puts "TITLE=suffix\nARTIST=Bayern2\nALBUM=Zuendfunk\n."
dt = 1e3
end
end
$stdout.flush
$stderr.puts "\t\tsleep for dt=#{dt} until #{Time.now + max(0.01, min(10, dt) )}"
$stderr.flush
# poll at most every 10 s, at least every 0.01 s
sleep max(0.01, min(10, dt) )
end
# never reached
exit 0
end
# Aufnahme - Mode 2 (no arguments): scrape today's programme, wait for
# the next Zündfunk broadcast and record it with streamripper, then
# write a track-description xml next to the mp3.
if ARGV[0].nil?
Bayern2.scrape Time.now # - dummy lookup to force a scrape if the db lacks 'now'
# - when does the next Zündfunk start - or is one already running (limit 1 dtend > now AND title like '%Zündfunk%').
bc = Bayern2.findNextBroadcastByTitleLike '%Zündfunk%', Time.now
exit 0 if bc.nil?
exit 3 if bc[:dtend].nil?
exit 4 if bc[:dtstart].nil?
# more than 6 h away: let a later (cron) run handle it
exit 0 if -Time.now.to_f + bc[:dtstart].to_f > 6*60*60
# - wait until 60 sec before the broadcast starts
dt = -Time.now.to_f + bc[:dtstart].to_f - 60
$stderr.puts "waiting until #{bc[:dtstart] - 60}"
sleep dt if dt > 0
# - start streamripper, passing the rowid(s) as parameter; rip until
#   120 sec after the broadcast ends
dt = -Time.now.to_f + bc[:dtend].to_f + 120
dt = dt.to_i
exit 6 if dt <= 0
dst_dir = File.dirname(__FILE__)
# find new (max+1) id to use and build a filename as streamripper would
maxid = 0
Dir.new(dst_dir).each do |file|
# non-greedy prefix match:
m = /.*?([0-9]+)-([0-9]{4})_([0-9]{2})_([0-9]{2})_([0-9]{2})_([0-9]{2})_([0-9]{2})\.mp3/.match file
next if m.nil?
current_id = m[1].to_i
maxid = current_id if current_id > maxid
end
dst_file = "#{maxid + 1}-#{bc[:dtend].strftime '%Y_%m_%d_%H_%M_%S'}.mp3"
# clean incomplete
# NOTE(review): dst_dir is interpolated unescaped into shell commands
# here and below - safe only while this script lives in a path without
# shell metacharacters.
system "rm #{dst_dir}/incomplete/*"
cmd = "/home/username/bin/streamripper '#{Bayern2.stream_uri}' -t -l #{dt} -u 'Mozilla' -s -d '#{dst_dir}' -a '#{dst_file}' -E '#{File.expand_path(__FILE__)} -fetch_external_metadata_for_broadcast_rowids #{bc[:rowid]}' 1> /dev/null"
$stderr.puts cmd
if system(cmd)
$stderr.puts 'ripping ok!'
# aggregate recordings
if Dir["#{dst_dir}/#{dst_file}"].length == 1
# regular recording file written, incomplete mustn't contain recordings.
if Dir["#{dst_dir}/incomplete/*recording.mp3"].length > 0
$stderr.puts "ERROR: OMG, found incomplete/*recording.mp3 AND #{dst_dir}/#{dst_file}"
else
# we're fine - the complete recording is in place.
end
else
# no 'regular' recording, so we aggregate all recordings from incomplete
# 'cat' is brute force, but works out fine.
system "cat #{dst_dir}/incomplete/*recording.mp3 > #{dst_dir}/#{dst_file}"
end
dst = "#{dst_dir}/#{dst_file}.xml"
File.open( dst, "w" ) { |f| write_recording_xml( f, bc ) }
$stderr.puts "wrote track description #{dst}"
else
$stderr.puts "ripping failed #{$?}"
exit 5
end
end
#################################################################################
#################################################################################
#################################################################################
# ugly mess, but working:
# - podcast.rss bauen + zippen
# Recursively serialize 'hash' into the REXML element 'parent'.
# String/Numeric/Time values become text (Time formatted via date_fmt,
# RFC 822 by default). Hash values recurse: as attributes for
# 'enclosure' and 'itunes:image', as child elements otherwise.
# When 'attributes' is true, scalar values are set as attributes on
# 'parent' instead of child elements.
def hash_to_xml hash, parent, attributes=false, date_fmt='%d %b %Y %H:%M:%S %z' # RFC 822 date-time
  hash.each do |k,v|
    txt = nil
    txt = v if v.kind_of? String
    txt = v.to_s if v.kind_of? Numeric
    txt = v.strftime date_fmt if v.kind_of? Time
    if attributes
      parent.add_attribute k.to_s, txt
    else
      elem = REXML::Element.new( k.to_s, parent )
      if !txt.nil?
        elem.text = txt
      else
        use_atts = 'enclosure' == k.to_s || 'itunes:image' == k.to_s
        # forward date_fmt (it was dropped on recursion, silently
        # reverting nested Time values to the default format)
        hash_to_xml( v, elem, use_atts, date_fmt ) if v.kind_of? Hash
      end
    end
  end
end
# Build the complete RSS 2.0 document: channel metadata from 'hash',
# build/pub dates, iTunes categories, and one <item> per entry of
# 'items'. Returns the REXML::Document.
def create_feed hash, items
  doc = REXML::Document.new
  doc << REXML::XMLDecl.new( '1.0', 'utf-8', 'yes' )
  root = REXML::Element.new('rss', doc)
  root.add_attribute 'xmlns:itunes', 'http://www.itunes.com/dtds/podcast-1.0.dtd'
  root.add_attribute 'version', '2.0'
  chan = REXML::Element.new('channel', root)
  hash_to_xml hash, chan
  build_info = { :lastBuildDate => Time.now, :pubDate => Time.now, :generator => 'streamripper to rss ruby script (C) Marcus Rohrmoser 2010' }
  hash_to_xml build_info, chan
  category = REXML::Element.new('itunes:category', chan)
  category.add_attribute 'text', 'Arts'
  REXML::Element.new('itunes:category', category).add_attribute 'text', 'Literature'
  # REXML::Element.new('itunes:category', category).add_attribute 'text', 'Arts'
  items.each { |entry| hash_to_item entry, chan }
  doc
end
# Append one <item> element built from the 'episode' hash to 'channel'.
def hash_to_item episode, channel
  hash_to_xml episode, REXML::Element.new('item', channel)
end
# Format a duration in seconds as H:MM:SS for the itunes:duration tag
# (hours unpadded, minutes and seconds zero-padded).
# Replaces the rjust(2).gsub(/ /, '0') space-then-substitute padding
# with plain zero-padded formatting.
def seconds_to_rss_duration seconds
  hours, rest = seconds.to_i.divmod 3600
  minutes, secs = rest.divmod 60
  format '%d:%02d:%02d', hours, minutes, secs
end
# Parse a track-description xml (as written by write_recording_xml) and
# return the first track as a hash of element-name => text; 'start' and
# 'stop' are parsed into Time objects. 'file' may be an IO or a string.
def load_recording file
  doc = REXML::Document.new file
  doc.each_element('/stream/track') do |track|
    meta = {}
    track.each_element('*') do |elem|
      value = elem.text.to_s
      value = Time.parse(value) if 'start' == elem.name || 'stop' == elem.name
      meta[ elem.name.intern ] = value
    end
    $stderr.puts "loaded #{file}"
    # only the first track is of interest
    return meta
  end
end
#####################################################################
## build the rss feed
#####################################################################
# Scan 'dst_dir' for streamripper mp3 recordings (each accompanied by a
# .xml track description) and build the complete podcast RSS document
# via create_feed. Returns the REXML::Document.
# NOTE(review): raises (ENOENT) if an .mp3 lacks its .xml sibling.
def build_rss dst_dir, channel
dst_dir = File.expand_path dst_dir
items = []
Dir.new(dst_dir).sort.each do |file|
# mp3 files as spit out from streamripper
m = /.*([0-9]{4}_[0-9]{2}_[0-9]{2}_[0-9]{2}_[0-9]{2}_[0-9]{2})\.mp3$/.match file
next if m.nil?
# map the local path below .../recorder/ onto the public podcast URL
url = "#{dst_dir}/#{file}".gsub(/.*\/recorder\//, "http://podcasts.example.com/")
meta = nil
File.open("#{dst_dir}/#{file}.xml", 'r' ) {|f| meta = load_recording f}
# broadcast duration in seconds, from the track description times
seconds = meta[:stop] - meta[:start]
items << {
:description => meta[:title],
:enclosure => { :url => url, :length => File.stat("#{dst_dir}/#{file}").size, :type => 'audio/mpeg' },
:guid => url,
:pubDate => meta[:start],
'itunes:explicit' => 'clean',
'itunes:duration' => seconds_to_rss_duration(seconds),
:title => "#{meta[:start].strftime '%A, %d. %B %Y'}",
'itunes:author' => 'Zündfunk', # overwrite id3 author causing iTunes encoding issues
'itunes:keywords' => 'Bayern2,Zündfunk,Jugend,Pop'
}
$stderr.puts "analyzed #{dst_dir}/#{file}.xml"
end
create_feed(channel, items)
end
# Channel-level RSS/iTunes metadata for the podcast feed.
channel = {
:title => 'B2 Zündfunk',
:language => 'de',
'itunes:explicit' => 'clean',
:description => 'Radiomitschnitt Bayern 2 Werktags 19:00 - 20:20.',
:link => 'http://www.br-online.de/bayern2/zuendfunk/index.xml',
'itunes:subtitle' => 'Die Jugendwelle im BR.',
# 'itunes:category' => { :text => 'Society &amp; Culture' },
'itunes:summary' => 'zeitgenössische akustische Popkultur.',
'itunes:author' => 'Zündfunk',
'itunes:owner' => {
'itunes:name' => 'My Name',
'itunes:email' => 'email@example.com'
},
'itunes:image' => {
:href => 'http://www.br-online.de/content/cms/Universalseite/2008/03/09/cumulus/BR-online-Publikation--229136-20081103160106.jpg'
},
:image => {
:url => 'http://www.br-online.de/content/cms/Universalseite/2008/03/09/cumulus/BR-online-Publikation--229136-20081103160106.jpg',
:title => 'B2 Zündfunk',
:link => 'http://www.br-online.de/bayern2/zuendfunk/index.xml'
}
}
# Build the feed from the recordings next to this script, then write
# podcast.rss plus a gzipped copy podcast.rssz alongside it.
rss = build_rss File.dirname(__FILE__), channel
dst = "#{File.dirname(__FILE__)}/podcast.rss"
File.open(dst, "w") {|f| rss.write f}
system "gzip --best < #{dst} > #{dst}z"
$stderr.puts "wrote podcast #{dst}z"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment