Filter the Sport section out of the BBC News RSS feed and put the result into a Backblaze B2 bucket
#!/usr/bin/env ruby | |
# # Sample crontab:
# # At 41 minutes past each hour, run the script and log the results
# 41 * * * * ~/bbc-news-rss-filter-sport-out.rb > ~/bbc-news-rss-filter-sport-out.log 2>&1
# Dependencies: | |
# * open-uri - load remote URL content easily | |
# * nokogiri - parse/filter XML | |
# * b2 - command line tools, described below | |
require 'bundler/inline'
gemfile do
  source 'https://rubygems.org'
  gem 'nokogiri'
end
require 'open-uri'
require 'tempfile'
# Regular expression describing the GUIDs to reject from the resulting RSS feed.
# We want to drop everything from the "sport" section of the website.
# `^` anchors at the start of any line in the GUID text, so a GUID preceded by
# a newline is still rejected.
REJECT_GUIDS_MATCHING = %r{^https://www\.bbc\.co\.uk/sport/}

# Assumption: you're set up with a Backblaze B2 account with a bucket to which
# you'd like to upload the resulting RSS file, and you've configured the 'b2'
# command-line tool (https://www.backblaze.com/b2/docs/b2_authorize_account.html)
# Name of the destination bucket; replace the placeholder before running.
B2_BUCKET = 'YOUR-BUCKET-NAME-GOES-HERE'.freeze
# File name the filtered feed is stored under inside the bucket.
B2_FILENAME = 'bbc-news-nosport.rss'.freeze
# Load the original RSS and remove every <item> whose <guid> text matches
# REJECT_GUIDS_MATCHING.
# URI.open is used instead of Kernel#open: since Ruby 3.0 open-uri no longer
# extends Kernel#open, so open('https://...') would look for a local file.
rss = Nokogiri::XML(URI.open('https://feeds.bbci.co.uk/news/rss.xml?edition=uk'))
rss.css('item')
   .select { |item| REJECT_GUIDS_MATCHING.match?(item.css('guid').text) }
   .each(&:unlink)

# Tempfile.create with a block closes and deletes the temporary file when the
# block exits, including when an exception or `abort` unwinds through it.
Tempfile.create do |temp_file|
  # Output resulting filtered RSS into the temporary file
  temp_file.write(rss.to_s)
  # Flush so the b2 process sees the whole document when it reads the path.
  temp_file.flush

  # Upload filtered RSS to a Backblaze B2 bucket. Passing the command as an
  # argument array runs b2 directly, without a shell, so the bucket name and
  # paths are never subject to shell word-splitting or expansion.
  # NOTE(review): the upload_file sub-command and flag spellings are kept from
  # the original script - confirm they match the installed b2 CLI version.
  upload_command = [
    'b2', 'upload_file', '--noProgress',
    '--contentType', 'application/rss+xml',
    B2_BUCKET, temp_file.path, B2_FILENAME
  ]
  result = IO.popen(upload_command, &:read)
  upload_status = Process.last_status

  puts Time.now
  # Fail loudly (non-zero exit) instead of logging only a timestamp when the
  # upload did not succeed.
  abort "b2 upload failed: #{upload_status}" unless upload_status.success?

  # Log only the line of b2's output that reports the uploaded file's URL.
  puts result.split("\n").grep(/^URL by file name:/).join("\n")
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment