Last active
May 18, 2017 23:58
-
-
Save henrik/10779 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# SweDB XMLTV Grabber for EyeTV | |
# by Henrik Nyh <http://henrik.nyh.se> under the MIT License. | |
# | |
# INSTALLATION | |
# | |
# Configure the list of channels below and run this script as a cron job, e.g. | |
# | |
# 0 */12 * * * ruby /path/to/this/script.rb | |
# to run it every 12 hours (suggested since cron jobs don't run if the computer is off). | |
# | |
# Every time it runs it will get schedules for those channels and import them into EyeTV. | |
# The script uses conditional GET to only update files if they've changed. | |
# If you're on OS X Mavericks, you probably need to run this for the script to work: | |
# | |
# xcode-select --install | |
# sudo gem install hpricot | |
# | |
# If you're on Lion or Mountain Lion, install Xcode and its command-line tools and then run: | |
# | |
# sudo gem install hpricot | |
# | |
# On Snow Leopard, it should just work. | |
# | |
# CHANGELOG | |
# | |
# 2015-06-17 1.6: New root URL. | |
# 2013-10-25 1.5: Unbreak on Mavericks. | |
# 2012-01-14 1.4: cron example is once per 12 hours, not once a minute every 12th hour. | |
# 2009-05-05 1.3: MIT License. | |
# 2008-10-30 1.2: EyeTV no longer steals focus on import. | |
# 2008-09-15 1.1: Always update channel list; not just if updated on server. Don't break on 404. | |
# 2008-03-09 1.0: First version. | |
# Use ids as listed in http://xmltv.tvsajten.com/channels.xml.gz. | |
# Channel ids to grab; use ids as listed in http://xmltv.tvsajten.com/channels.xml.gz.
# Frozen so the constant cannot be mutated accidentally at runtime.
CHANNELS = %w[
  kanal9.se
  hd.svt.se
  history.viasat.se
  world.discovery.com
  nordic.travel.discovery.com
  ztv.se
  nature.viasat.se
  showtime.nonstop.tv
  silver.nonstop.tv
].freeze
%w[zlib open-uri date fileutils rubygems hpricot].each {|lib| require lib } | |
DAYS = 15 # A value from 1 to 15. Fetches yesterday + n days.
          # We need yesterday for today's early morning shows.
ROOT_URL = "http://xmltv.tvsajten.com/channels.xml.gz".freeze
USER_AGENT = "SweDB XMLTV Grabber for EyeTV/1.6 <henrik@nyh.se>".freeze
DIRECTORY = "/Library/Application Support/EyeTV/XMLTV".freeze
DB = File.join(DIRECTORY, "condget.db").freeze
# Matches schedule filenames like "kanal9.se_2017-05-18.xml", capturing the date.
SCHEDULE_RE = /_(\d{4}-\d{2}-\d{2})\.xml$/
YESTERDAY = Date.today - 1
# Load or create DB hash mapping file paths to Conditional GET modification dates.
# Using file paths instead of URLs since this makes removing out-of-date entries easier.
# If the DB file is missing or unreadable, start over with an empty hash whose
# default value is "" (an empty If-Modified-Since header).
$db =
  begin
    File.open(DB) { |file| Marshal.load(file) }
  rescue StandardError
    Hash.new("")
  end
# This method does conditional GET based on Last-Modified/If-Modified-Since only
# (no etag) and assumes gzipped content. The gunzipped body is written to the
# local path derived from the URL, and the server's Last-Modified header is
# recorded in $db keyed by that local path.
# Returns the local file path if new content was retrieved, false otherwise
# (i.e. on 304 Not Modified or 404 Not Found).
def condget(url)
  path = path_from_url(url)
  open(url, "If-Modified-Since" => $db[path], "User-Agent" => USER_AGENT) do |response|
    # Store last-modified for the next conditional request.
    $db[path] = response.meta["last-modified"]
    # Gunzip; fall back to the raw body if the server sent it uncompressed.
    begin
      response_body = Zlib::GzipReader.new(response).read
    rescue Zlib::GzipFile::Error
      response.rewind
      response_body = response.read
    end
    File.open(path, 'w') {|file| file.write response_body }
    path
  end
rescue OpenURI::HTTPError => e
  # 304 and 404 are expected outcomes; anything else should abort the run.
  raise unless e.message =~ /^(304|404)/
  false
end
# Map a remote .gz URL to the local (ungzipped) file path under DIRECTORY.
def path_from_url(url)
  File.join(DIRECTORY, File.basename(url, ".gz"))
end
# Remove expired schedules: delete any schedule file dated before yesterday,
# along with its conditional-GET bookkeeping entry.
Dir[File.join(DIRECTORY, "*.xml")].each do |file|
  match = SCHEDULE_RE.match(file)
  next unless match && Date.parse(match[1]) < YESTERDAY
  File.delete(file)
  $db.delete(file)
end
# Update scheduling: collect paths of every file changed this run so they can
# be handed to EyeTV at the end.
updates = []
# Create directory structure if it doesn't exist already.
FileUtils.mkdir_p DIRECTORY
# Refresh the local copy of the root channel list (a no-op on 304), then parse
# the local file. The return value of condget isn't needed here since we always
# re-filter the channel list below.
condget(ROOT_URL)
doc = Hpricot(open(path_from_url(ROOT_URL)))
# Create active_channels.xml containing only the requested channels:
# remove every <channel> element whose id is not in CHANNELS.
excluded_channels = doc.search('channel').reject {|channel| CHANNELS.include?(channel[:id]) }
Hpricot::Elements[*excluded_channels].remove
active_channels_file = File.join(DIRECTORY, "active_channels.xml")
active_channels_xml = doc.to_s.gsub(/\n\s+\n/, "\n") # get rid of excess whitespace
File.open(active_channels_file, "w") {|file| file.write active_channels_xml }
updates << active_channels_file
# Loop over the remaining (active) channels and fetch one schedule file per day,
# from yesterday through yesterday + DAYS.
channels = doc.search('channel').map { |channel| [channel[:id], channel.at('base-url').inner_text] }
channels.each do |id, base_url|
  (YESTERDAY..YESTERDAY + DAYS).each do |day|
    url = File.join(base_url, "#{id}_#{day}.xml.gz")
    path = condget(url)
    updates << path if path
  end
end
# Persist the conditional GET DB hash to disk for the next run.
File.open(DB, "w") { |file| Marshal.dump($db, file) }
# Do the actual EyeTV import; -g keeps EyeTV from stealing focus.
system("open", "-ga", "EyeTV", *updates)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Someone reported the following error – perhaps someone else is seeing the same one: