#!/usr/bin/env ruby
# Ruby script to scrape termine.orf.at for FM4 music events and
# turn the result into an RSS feed.
#
# Usage: ruby fm4-musik.rb <result-file> <data-file> [-digest]
#
# The result file is where the feed xml is written,
# and the data file stores the state necessary to track
# events that were already posted.
# If the digest flag is set, all events found in one run
# are written into a single post; otherwise each event
# gets its own post.
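#
# An example invocation, e.g. from a cron job (the file names here
# are hypothetical):
#
#   ruby fm4-musik.rb fm4-feed.xml fm4-state.dump -digest
#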
# MIT LICENSE:
#
# Copyright (c) 2013 Markus Pointner
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

require 'rubygems'
require 'open-uri'  # stdlib
require 'nokogiri'  # third-party gem: gem install nokogiri
require 'rss/maker' # stdlib

# --- config ---
# start date of the events to query
STARTDATE = Time.new
# end date of the events to query
ENDDATE = STARTDATE + (60 * 60 * 24 * 60) # + 60 days
# event category on the site
CATEGORY = "79" # music
# base url, also used as channel url for the rss feed
BASE_SITE = "http://termine.orf.at/fm4/index.php"
# first url to query, will also be used to fetch links to more pages
SITE = "#{BASE_SITE}?action=searchform&qtext=&countryId=&categoryId=#{CATEGORY}" +
       "&startdate_date=#{STARTDATE.day}&startdate_month=#{STARTDATE.mon}&startdate_year=#{STARTDATE.year}" +
       "&enddate_date=#{ENDDATE.day}&enddate_month=#{ENDDATE.mon}&enddate_year=#{ENDDATE.year}"
# --- end of config ---

# load the given url as a nokogiri document, or return nil on failure
def load_document(url)
  retries = 0
  while true
    # fetch the page (Kernel#open handles urls via open-uri;
    # on ruby >= 3 this would be URI.open)
    body = open(url) { |io| io.read }
    if body[0..100].include? "SQL ERROR" # this happened sometimes
      if retries < 3
        retries += 1
        next
      else
        return nil
      end
    end
    # sadly, this site delivers poorly structured html, so we try to fix it up
    body.gsub!("</tr>\n <td", "</tr>\n<tr>\n <td")
    return Nokogiri::HTML(body)
  end
rescue
  # network or parse errors: treat the page as unavailable
  nil
end

# fetch a list of events from the given nokogiri document;
# page_link is the url of the page, used to improvise event links
def fetch_events(doc, page_link)
  # generate a hash for each event
  doc.css(".listEventMainCell").map do |event|
    title_tag = event.at(".eventTitle")
    title = title_tag.inner_html
    location_tag = event.parent.parent.at(".eventLocation")
    location = location_tag.inner_html
    date = event.at(".eventDate").inner_html
    # these links are quite tricky, there may not even be any link at all
    event_link_tag = event.search(".eventDescription a").last ||
                     event.search(".eventUrl a").last
    if event_link_tag
      event_link = event_link_tag["href"]
    elsif page_link and title
      # improvise a unique link from the page url and the title
      event_link = page_link + "#" + title
    else
      next
    end
    # try to get an image
    image_tag = event.parent.parent.at(".eventThumbnail img")
    # throw out some garbage:
    # remove the "link" image (search doesn't yield to a block,
    # so iterate over the node set with each)
    event.search("img").each do |img|
      img.swap "link" if img["src"].to_s.include? "link.gif"
    end
    # remove the little arrow before the date
    event.search(".arrow").remove
    # add breaks after title and description
    title_tag.after("<br>") if title_tag
    event.at(".eventDescription").after("<br>") if event.at(".eventDescription")
    {
      :link => event_link,
      :date => date,
      :title => title,
      :location => location,
      :html => location_tag.to_s + image_tag.to_s + event.inner_html
    }
  end
rescue StandardError
  # if the page structure changed in some unexpected way, skip this page
  []
end
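
# Each item returned by fetch_events is a plain hash. An example with
# made-up values (the markup in :html comes straight from the site):
#   { :link     => "http://example.com/band#Some Band",
#     :date     => "Mo, 01.01.",
#     :title    => "Some Band",
#     :location => "Arena Wien",
#     :html     => "<span class=\"eventLocation\">Arena Wien</span>..." }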

# remove any items that were already posted, and remove posts that are too old
def filter_old_items(new_items, old_posts, time_limit)
  # filter out any items that were already posted;
  # items are identified by their (date, title) pair
  new_items.reject! do |item|
    old_posts.any? do |post|
      post[:items].any? do |old_item|
        old_item[:date] == item[:date] && old_item[:title] == item[:title]
      end
    end
  end
  # remove posts that are too old
  old_posts.reject! { |post| post[:time] < time_limit }
  [new_items, old_posts]
end

# create the rss feed xml and return it as a string
def make_feed(posts, encoding)
  feed = RSS::Maker.make("2.0") do |m|
    m.channel.title = "FM4 Termine - Musik"
    m.channel.link = BASE_SITE
    m.channel.description = "Konzerte von fm4.orf.at"
    m.items.do_sort = true # sort items by date
    posts.each do |p|
      post = m.items.new_item
      post.title = p[:title]
      post.link = p[:link]
      post.date = p[:time]
      post.description = p[:html]
    end
  end
  # fix the feed encoding; use sub instead of sub!, which would
  # return nil if the encoding attribute were missing
  feed.to_s.sub(/encoding="[^"]+"/, "encoding=\"#{encoding}\"")
end

# convert an item hash to a line of html for the feed post
def item_to_html(item)
  "<ul><a href=\"#{item[:link]}\">#{item[:date]} - #{item[:title]}</a>&nbsp;(#{item[:location]})</ul>"
end
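
# For the example item above, this yields (roughly):
#   <ul><a href="http://example.com/band#Some Band">Mo, 01.01. - Some Band</a>&nbsp;(Arena Wien)</ul>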

# convert a list of item hashes into a single feed post hash
def make_digest_post(items, postdate, postlink)
  # create the html for the new post
  html = items.map { |item| item_to_html(item) }.join("\n")
  # make a post hash
  {
    :title => "FM4 Termine - #{postdate.strftime("%a %b %d %Y")}",
    :link => postlink,
    :items => items,
    :time => postdate,
    :html => html
  }
end
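
# A digest post built from two items would look like this (values made up):
#   { :title => "FM4 Termine - Fri Feb 15 2013",
#     :link  => SITE,
#     :items => [item1, item2],
#     :time  => STARTDATE,
#     :html  => "<ul>...</ul>\n<ul>...</ul>" }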

# convert a list of item hashes into a list of event feed post hashes
def make_event_posts(items, postdate)
  items.map do |item|
    # use the same keys as for a digest post, to keep it compatible
    {
      :title => "#{item[:date]} - #{item[:title]}",
      :link => item[:link],
      :items => [item],
      :time => postdate,
      :html => item[:html]
    }
  end
end

# do main stuff
def main
  abort "Usage: #{File.basename($PROGRAM_NAME)} <result-file> <data-file> [-digest]" if ARGV.count < 2
  resultfile = ARGV.shift
  dumpfile = ARGV.shift
  is_digest = (ARGV.shift == "-digest")

  # fetch the first page, get the links to the other pages
  puts "loading \"#{SITE}\""
  doc = load_document(SITE)
  abort "failed to load \"#{SITE}\"" if doc.nil?
  # read the charset from the first meta tag; fall back to utf-8
  # if the page declares no charset
  charset_meta = doc.search("head meta").first
  charset_match = charset_meta && charset_meta[:content].to_s.match(/charset=(.+)/)
  charset = charset_match ? charset_match.captures.first : "UTF-8"
  links = doc.search("#listPageNavigTop a").map { |a| BASE_SITE + a["href"] }
  links.uniq!
  puts "found #{links.count} links"

  # we will use the first element as a dummy to process the first page,
  # whose document is already loaded above
  links.unshift nil
  # fetch all events
  items = links.map do |link|
    doc = load_document(link) if link
    fetch_events(doc, link || SITE) if doc
  end
  # clean up the item list
  items.flatten!
  items.compact!
  abort "no events found" if items.empty?
  puts "found #{items.count} events"

  # get data from previous scrapes
  posts = []
  if File.exist?(dumpfile)
    posts = Marshal.load(File.open(dumpfile, "rb") { |io| io.read })
    puts "loaded #{posts.count} old posts"
  end

  # filter out any items that were already posted,
  # and remove posts that are too old
  datediff = ENDDATE - STARTDATE
  olddate = STARTDATE - datediff
  items, posts = filter_old_items(items, posts, olddate)
  puts "#{items.count} events after filtering"
  if items.empty?
    puts "no new events, exiting"
    exit 0
  end

  # add the new post(s), create the feed
  if is_digest
    posts << make_digest_post(items, STARTDATE, SITE)
  else
    posts += make_event_posts(items, STARTDATE)
  end
  puts "writing #{posts.count} posts"
  feedxml = make_feed(posts, charset)

  # write the feed to an xml file
  puts "writing feed file \"#{resultfile}\""
  File.open(resultfile, "w") do |io|
    io << feedxml
  end
  # store the scraped data for next time; Marshal data is binary,
  # so open the file in binary mode
  File.open(dumpfile, "wb") do |io|
    io << Marshal.dump(posts)
  end
end

if __FILE__ == $0
  main
end