#!/usr/bin/ruby
#
# == Synopsis
#
# mtscrape: parse and download streams of ZDF.de Mediathek
#
# == Usage
#
# mtscrape [OPTIONS] [SOURCES]
#
#  The following OPTIONS can be specified:
#
#   -a, --age       Maximum age of feed items in days (default: 5)
#   -c, --convert   Convert wmv to ogg/theora
#   -d, --dir       Output directory
#   -f, --fast      Stream low-quality (faster)
#   -m, --match     Regular expression to match against items
#   -h, --help      This help text
#   -v, --verbose   Verbose output
#
#  Note: -a and -m are only used in combination with -C
#        -f is not used in combination with -A
#
#  The following SOURCES can be specified:
#
#   -A, --asx       Parse ASX link/file directly
#   -C, --category  Parse RSS feed of mediathek category ID
#   -I, --item      Parse JSON of mediathek item ID
#   -L, --link      Parse JSON of mediathek link
#
#  Note: you can specify all sources multiple times to download multiple items.
#
# == Examples
#
# This command will download the item with ID 257404 (heute 100sec, 15.10.07):
#
#   mtscrape -v -I 257404
#
# Same file as above, but using the ASX file directly (easier to find in the webinterface):
#
#  mtscrape -v -A http://wstreaming.zdf.de/zdf/veryhigh/071015_hko_2000.asx
#
# Same file as above, but using the HTTP link directly (easier to find in the RSS feed):
#
#  mtscrape -v -L http://www.zdf.de/ZDFmediathek/content/heute_100SEC/166/257404
#
# This will stream all items from category 208 (JBK) and 414 (Maybrit Illner)
# published within the last week into directory /data/talkshows/:
#
#   mtscrape -v -a 7 -d /data/talkshows -C 208 -C 414
#
# This will stream all items from category ID 228 (heute) since yesterday whose
# title matches the regular expression '.*heute-journal.*':
#
#   mtscrape -v -a 1 -C 228 -m '.*heute-journal.*'
#
# == Bugs
#
# This script will not work with livestreams. mtscrape has been designed for
# automatic download of on-demand content, but downloading (parts of) the
# livestream requires manual work (i.e. stopping the recording) and is therefore
# not implemented in this script.
#
# In fact, mtscrape even prevents livestreams from being downloaded if they appear
# in the RSS feeds.
#
# To dump the livestream you can use mplayer directly:
#
#   mplayer \
#      -playlist http://wgeostreaming.zdf.de/encoder/livestream15_vh.asx \
#      -dumpstream \
#      -dumpfile zdf_stream.wmv
#
# == Requirements
#
# To use this script you need the following software installed on your system:
#
#   - Ruby-1.8.x (apt-get install ruby1.8 rubygems, emerge ruby, etc)
#   - Ruby-JSON (gem install json)
#   - LibXML-Ruby (gem install libxml-ruby)
#   - mplayer (apt-get install mplayer, emerge mplayer, etc)
#   - ffmpeg2theora (only for --convert)
#
# == License
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
#
# == Copyright
#
# 2007 Benedikt Boehm <hollow@gentoo.org>
#
# == ChangeLog
#
#  - Oct 26 2007: add --convert (v0.2)
#  - Oct 15 2007: initial release (v0.1)

require 'rubygems'
require 'date'
require 'getoptlong'
require 'json' # gem install json
require 'open-uri'
require 'rdoc/usage'
require 'rss/1.0'
require 'rss/2.0'
require 'xml/libxml' # gem install libxml-ruby

# Option defaults; the matching command-line switch is noted on each line.
@verbose = false                   # -v
@convert = false                   # -c
@filter  = nil                     # -m (nil means: accept every title)
@dir     = '.'                     # -d
@bw      = 'dsl2000'               # -f selects 'dsl1000' instead
@age     = Time.now - (5 * 86_400) # -a (default: five days back from now)

# Queues of requested sources, filled while the command line is parsed.
@asxs, @cats, @items, @links = [], [], [], []

# Command-line grammar. Each entry is [long name, short alias, argument mode].
with_arg = GetoptLong::REQUIRED_ARGUMENT
no_arg   = GetoptLong::NO_ARGUMENT

opts = GetoptLong.new(
  # settings
  ['--age',      '-a', with_arg],
  ['--convert',  '-c', no_arg],
  ['--dir',      '-d', with_arg],
  ['--fast',     '-f', no_arg],
  ['--match',    '-m', with_arg],
  ['--help',     '-h', no_arg],
  ['--verbose',  '-v', no_arg],

  # sources (each may be given several times)
  ['--asx',      '-A', with_arg],
  ['--category', '-C', with_arg],
  ['--item',     '-I', with_arg],
  ['--link',     '-L', with_arg]
)

# Read an ASX playlist and return the stream address of its first entry.
#
# link:: URL (any scheme open-uri can fetch) or local path of the ASX file
#
# Returns the 'href' attribute of the first Ref element inside the first
# Entry element below the document root. Raises RuntimeError when the
# playlist contains no such element.
#
# The playlist is read through URI#read (open-uri) or File.open, never through
# Kernel#open: the link may come from remote JSON data, and Kernel#open would
# run a link starting with "|" as a shell command.
def mt_parse_asx(link)
  # Same scheme test open-uri applies before treating a string as a URL.
  uri  = URI.parse(link) if link =~ %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}
  data = if uri && uri.respond_to?(:read)
           uri.read
         else
           File.open(link) { |f| f.read }
         end

  entry = XML::Parser.string(data).parse.root.find('Entry').first
  ref   = entry && entry.find('Ref').first
  raise "no Entry/Ref element in ASX playlist #{link}" unless ref

  ref['href']
end

# Resolve a mediathek content link to the address of its video stream.
#
# link:: URL of a mediathek content page
#
# Requests the page's JSON description (view=navJson) for the bandwidth
# stored in @bw. Returns "INVALID:<assetType>" when the asset is not a video;
# otherwise returns what mt_parse_asx yields for the asset's ASX playlist.
#
# Raises ArgumentError when link is not a URL open-uri can fetch
# (URI::InvalidURIError when it cannot be parsed at all).
#
# The request goes through URI#read instead of Kernel#open, so a link taken
# from a feed can never be executed as a "|command" pipe.
def mt_stream_url(link)
  uri = URI.parse("#{link}?&bw=#{@bw}&pp=wmp&view=navJson")
  raise ArgumentError, "cannot fetch #{link}: not a fetchable URL" unless uri.respond_to?(:read)

  json = JSON.parse(uri.read)
  return "INVALID:#{json['assetType']}" if json['assetType'] != "video"
  mt_parse_asx(json['assetUrl'])
end

# Animation frames shown by wait while a child process is running.
@spinner = '-\\|/'.split('')

# Block until the child process +pid+ has exited.
#
# In verbose mode the child is polled every 0.05 seconds and a spinner frame
# is drawn over the previous one (via backspace) on each poll; when the child
# is gone the spinner is replaced by a "." and the line is ended. Otherwise
# the call simply blocks in waitpid.
def wait(pid)
  return Process.waitpid(pid, 0) unless @verbose

  frame = 0
  until Process.waitpid(pid, Process::WNOHANG)
    print "\b#{@spinner[frame]}"
    STDOUT.flush
    frame = (frame + 1) % 4
    sleep 0.05
  end

  puts "\b\b."
end

# Download one stream with mplayer unless it has been downloaded before.
#
# url:: stream address, or an "INVALID:<type>" marker as produced by
#       mt_stream_url for assets that are not videos
#
# The output file is named after the last path segment of url and placed in
# @dir. Marker values and already existing files are skipped. When @convert
# is set, the fresh download is handed on to convert_stream.
def dump_stream(url)
  match = /\AINVALID:(.*)/.match(url)
  if match
    puts "[skip] this looks like a #{match[1]} source"
    return
  end

  basename = url.split("/").last
  outfile  = "#{@dir}/#{basename}"

  # Plain existence test instead of rescuing a failed File::Stat.new, so that
  # unrelated errors are no longer mistaken for "file is missing".
  if File.exist?(outfile)
    puts "[skip] File already exists (#{outfile})" if @verbose
    return
  end

  print "[dump] #{url} -> #{outfile}  " if @verbose
  # Argument-list form of system: no shell is involved, so quotes or other
  # shell metacharacters in the remote-supplied url cannot inject commands.
  wait(fork do
    system('mplayer', '-nolirc', '-really-quiet', '-dumpstream', '-dumpfile', outfile, url)
  end)
  convert_stream(outfile) if @convert
end

# Convert a downloaded file with ffmpeg2theora (the --convert option).
#
# file:: path of the file to convert
#
# Announces the conversion in verbose mode, then blocks (through wait) until
# the converter process has finished.
def convert_stream(file)
  print "[conv] #{file}  " if @verbose
  # Argument-list form of system: the file name, which is derived from a
  # remote URL, is passed verbatim instead of being spliced into a shell
  # command line where a quote could inject further commands.
  wait(fork do system('ffmpeg2theora', file) end)
end

# Apply the parsed command line: switches update the settings, source
# options are appended to their queue in the order they were given.
opts.each do |name, value|
  case name
  # settings
  when '--age'      then @age = Time.now - (value.to_i * 24 * 3600) # value counts days
  when '--convert'  then @convert = true
  when '--dir'      then @dir = value
  when '--fast'     then @bw = 'dsl1000'
  when '--match'    then @filter = Regexp.new(value, true, 'u')
  when '--verbose'  then @verbose = true
  when '--help'
    RDoc::usage
    exit 0

  # sources
  when '--asx'      then @asxs << value
  when '--category' then @cats << value.to_i
  when '--item'     then @items << value.to_i
  when '--link'     then @links << value
  end
end

# Work through the requested sources: ASX playlists first, then category
# feeds, item IDs and finally direct links.

@asxs.each { |playlist| dump_stream(mt_parse_asx(playlist)) }

@cats.each do |category|
  feed = RSS::Parser.parse("http://www.zdf.de/ZDFmediathek/content/#{category}?view=rss")
  feed.items.each do |entry|
    # Entries dated before the --age limit are ignored without any output.
    next unless entry.date >= @age

    puts "[test] #{entry.title}" if @verbose
    # Without a --match filter every remaining entry is downloaded.
    wanted = @filter.nil? || entry.title =~ @filter
    dump_stream(mt_stream_url(entry.link)) if wanted
  end
end

@items.each do |id|
  page = "http://www.zdf.de/ZDFmediathek/content/#{id}"
  dump_stream(mt_stream_url(page))
end

@links.each { |page| dump_stream(mt_stream_url(page)) }