#!/usr/bin/ruby
#
# == Synopsis
#
# mtscrape: parse and download streams of ZDF.de Mediathek
#
# == Usage
#
# mtscrape [OPTIONS] [SOURCES]
#
# The following OPTIONS can be specified:
#
# -a, --age Maximum age of feed item
# -c, --convert Convert wmv to ogg/theora
# -d, --dir Output directory
# -f, --fast Stream low-quality (faster)
# -m, --match Regular expression to match against items
# -h, --help This help text
# -v, --verbose Verbose output
#
# Note: -a and -m are only used in combination with -C
# -f is not used in combination with -A
#
# The following SOURCES can be specified:
#
# -A, --asx Parse ASX link/file directly
# -C, --category Parse RSS feed of mediathek category ID
# -I, --item Parse JSON of mediathek item ID
# -L, --link Parse JSON of mediathek link
#
# Note: you can specify all sources multiple times to download multiple items.
#
# == Examples
#
# This command will download the item with ID 257404 (heute 100sec, 15.10.07):
#
# mtscrape -v -I 257404
#
# Same file as above, but using the ASX file directly (easier to find in the webinterface):
#
# mtscrape -v -A http://wstreaming.zdf.de/zdf/veryhigh/071015_hko_2000.asx
#
# Same file as above, but using the HTTP link directly (easier to find in the RSS feed):
#
# mtscrape -v -L http://www.zdf.de/ZDFmediathek/content/heute_100SEC/166/257404
#
# This will stream all items from category 208 (JBK) and 414 (Maybritt Illner)
# since one week ago into directory /data/talkshows/:
#
# mtscrape -v -a 7 -d /data/talkshows -C 208 -C 414
#
# This will stream all items from category ID 228 (heute) since yesterday whose
# title matches the regular expression '.*heute-journal.*':
#
# mtscrape -v -a 1 -C 228 -m '.*heute-journal.*'
#
# == Bugs
#
# This script will not work with livestreams. mtscrape has been designed for
# automatic download of ondemand content, but downloading (parts of) the
# livestream requires manual work (i.e. stopping the recording) and is therefore
# not implemented in this script.
#
# In fact, mtscrape even prevents livestreams from being downloaded if they appear
# in the RSS feeds.
#
# To dump the livestream you can use mplayer directly:
#
# mplayer \
# -playlist http://wgeostreaming.zdf.de/encoder/livestream15_vh.asx \
# -dumpstream \
# -dumpfile zdf_stream.wmv
#
# == Requirements
#
# To use this script you need the following software installed on your system:
#
# - Ruby-1.8.x (apt-get install ruby1.8 rubygems, emerge ruby, etc)
# - Ruby-JSON (gem install json)
# - LibXML-Ruby (gem install libxml-ruby)
# - mplayer (apt-get install mplayer, emerge mplayer, etc)
# - ffmpeg2theora (only for --convert)
#
# == License
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
#
# == Copyright
#
# 2007 Benedikt Boehm
#
# == ChangeLog
#
# - Oct 26 2007: add --convert (v0.2)
# - Oct 15 2007: initial release (v0.1)
require 'rubygems'
require 'date'
require 'getoptlong'
require 'json' # gem install json
require 'open-uri'
require 'rdoc/usage'
require 'rss/1.0'
require 'rss/2.0'
require 'xml/libxml' # gem install libxml-ruby
# Option defaults; the command line switches processed further down may
# overwrite them.
@age     = Time.now - 5 * 86_400 # feed items dated before this are ignored (5 days back)
@bw      = 'dsl2000'             # sent as the bw= query parameter by mt_stream_url
@dir     = '.'                   # directory the dumped files are written to
@filter  = nil                   # optional Regexp matched against feed item titles
@convert = false
@verbose = false

# Sources to process, one list per source switch (-A, -C, -I, -L).
@asxs, @cats, @items, @links = [], [], [], []
# Command line grammar. Each row holds the long name, the short name and
# whether the switch requires a value.
option_table = [
  ['--age',      '-a', true],
  ['--convert',  '-c', false],
  ['--dir',      '-d', true],
  ['--fast',     '-f', false],
  ['--match',    '-m', true],
  ['--help',     '-h', false],
  ['--verbose',  '-v', false],
  ['--asx',      '-A', true],
  ['--category', '-C', true],
  ['--item',     '-I', true],
  ['--link',     '-L', true]
]
opts = GetoptLong.new(*option_table.map { |long, short, takes_value|
  [long, short, takes_value ? GetoptLong::REQUIRED_ARGUMENT : GetoptLong::NO_ARGUMENT]
})
# Read the ASX playlist at +link+ and return the 'href' attribute of the
# first 'Ref' element inside the first 'Entry' element of the document root.
#
# link:: location of the ASX playlist, passed to open as is
def mt_parse_asx(link)
  playlist = open(link) { |io| io.read }
  root = XML::Parser.string(playlist).parse.root
  first_entry = root.find('Entry').first
  first_entry.find('Ref').first['href']
end
# Fetch the JSON description of the mediathek page +link+ and resolve it to
# a stream URL.
#
# link:: URL of the mediathek item; the bandwidth (@bw) and view parameters
#        are appended to it
#
# Returns the result of mt_parse_asx for the item's 'assetUrl' when its
# 'assetType' is "video", otherwise the marker string "INVALID:<assetType>".
def mt_stream_url(link)
  payload = open("#{link}?&bw=#{@bw}&pp=wmp&view=navJson") { |io| io.read }
  asset = JSON.parse(payload)
  kind = asset['assetType']
  if kind == "video"
    mt_parse_asx(asset['assetUrl'])
  else
    "INVALID:#{kind}"
  end
end
# Animation frames printed by wait while a child process is still running.
@spinner = %w[- \\ | /]
# Block until the child process +pid+ has terminated.
#
# pid:: process id of a child of this process
#
# Without @verbose this is a plain blocking waitpid. With @verbose the child
# is polled every 0.05 seconds and a spinner frame is printed on each poll;
# once the child is gone the spinner is replaced by a "." and a newline.
def wait(pid)
  return Process.waitpid(pid, 0) unless @verbose

  frame = 0
  # With WNOHANG, waitpid returns nil while the child is still running.
  until Process.waitpid(pid, Process::WNOHANG)
    print "\b#{@spinner[frame]}"
    STDOUT.flush
    frame = frame.succ % 4
    sleep 0.05
  end
  puts "\b\b."
end
# Download the stream at +url+ into @dir using mplayer, unless a file of the
# same name is already there.
#
# url:: stream URL as returned by mt_parse_asx, or a marker string of the
#       form "INVALID:<assetType>" as returned by mt_stream_url for items
#       that are not videos (such sources are skipped)
#
# The output file is named after the last path component of +url+. An
# existing file is never overwritten. When @convert is set, the dumped file
# is passed to convert_stream afterwards.
def dump_stream(url)
  invalid = /\AINVALID:(.*)/.match(url)
  if invalid
    puts "[skip] this looks like a #{invalid[1]} source"
    return
  end

  # to_s keeps a URL without a final path component from raising in
  # File.join; it then maps to the directory itself, which exists and is
  # therefore skipped below.
  outfile = File.join(@dir, url.split("/").last.to_s)
  if File.exist?(outfile)
    puts "[skip] File already exists (#{outfile})" if @verbose
    return
  end

  print "[dump] #{url} -> #{outfile} " if @verbose
  # Argument-list form of system: no shell is involved, so quotes or other
  # shell metacharacters in the URL (which comes from remote data) and in the
  # file name derived from it cannot be used to run arbitrary commands.
  wait(fork { system('mplayer', '-nolirc', '-really-quiet', '-dumpstream', '-dumpfile', outfile, url) })
  convert_stream(outfile) if @convert
end
# Run ffmpeg2theora on +file+ and wait for it to finish.
#
# file:: path of the dumped stream to convert
def convert_stream(file)
  print "[conv] #{file} " if @verbose
  # Argument-list form of system: the file name is derived from a remote URL
  # and must not be interpreted by a shell.
  wait(fork { system('ffmpeg2theora', file) })
end
# Apply the command line: setting switches overwrite the defaults, source
# switches append to their list and may therefore be given several times.
opts.each do |name, value|
  case name
  when '--age'      then @age = Time.now - (value.to_i * 24 * 3600) # value is a number of days
  when '--convert'  then @convert = true
  when '--dir'      then @dir = value
  when '--fast'     then @bw = 'dsl1000'
  when '--match'    then @filter = Regexp.new(value, true, 'u')
  when '--help'
    RDoc::usage
    exit 0
  when '--verbose'  then @verbose = true
  when '--asx'      then @asxs << value
  when '--category' then @cats << value.to_i
  when '--item'     then @items << value.to_i
  when '--link'     then @links << value
  end
end
# ASX playlists are resolved to their stream URL and dumped directly.
@asxs.each { |playlist| dump_stream(mt_parse_asx(playlist)) }

# Categories: walk the RSS feed of each category and dump every item that is
# recent enough and, if a filter was given, whose title matches it.
@cats.each do |category|
  feed = RSS::Parser.parse("http://www.zdf.de/ZDFmediathek/content/#{category}?view=rss")
  feed.items.each do |entry|
    next unless entry.date >= @age

    puts "[test] #{entry.title}" if @verbose
    wanted = @filter.nil? || entry.title =~ @filter
    dump_stream(mt_stream_url(entry.link)) if wanted
  end
end

# Item IDs are turned into mediathek links and handled before the links
# given with -L, in command line order within each group.
item_links = @items.map { |id| "http://www.zdf.de/ZDFmediathek/content/#{id}" }
(item_links + @links).each { |page| dump_stream(mt_stream_url(page)) }