Skip to content

Instantly share code, notes, and snippets.

@Dan-Q
Created February 3, 2025 20:41
"BBC News... without the crap" RSS feed generators 2025 | https://danq.me/2025/02/03/bbc-news-rss-improved/
#!/usr/bin/env ruby
require 'bundler/inline'
# Dependencies:
# * open-uri - load remote URL content easily
# * nokogiri - parse/filter XML
gemfile do
source 'https://rubygems.org'
gem 'nokogiri'
end
require 'open-uri'
# Regular expressions describing the items to reject from the resulting RSS feed.
# GUIDs: drop iPlayer/Sounds/Ideas/videos/programmes links. The "sport" section is
# deliberately not listed here, so sport stories are retained in this variant.
REJECT_GUIDS_MATCHING = %r{^https://www\.bbc\.(co\.uk|com)/(iplayer|sounds|ideas|news/videos|programmes)/}
# Titles: drop any item whose title is exactly "BBC News app".
REJECT_TITLES_MATCHING = /^(BBC News app)$/

# Replaces an <item> with an XML comment holding its original markup, prefixed
# with a label saying why it was removed. "--" is not allowed inside an XML
# comment, so every occurrence in the item's markup is rewritten first.
comment_out = lambda do |item, label|
  item.swap("<!-- [#{label}] #{item.to_s.gsub(/--/, '[hyphen][hyphen]')} -->")
end

# Load the original RSS. URI.open is required here: since Ruby 3.0, Kernel#open
# no longer fetches URLs, even with open-uri loaded. The block form closes the
# handle as soon as parsing has finished.
rss = URI.open('https://feeds.bbci.co.uk/news/rss.xml?edition=uk') { |io| Nokogiri::XML(io) }

# Filter out every item whose guid or title matches one of the reject patterns.
rejected_items = rss.css('item').select do |item|
  item.css('guid').text.match?(REJECT_GUIDS_MATCHING) ||
    item.css('title').text.match?(REJECT_TITLES_MATCHING)
end
rejected_items.each { |item| comment_out.call(item, 'REJECTED') }

# Strip the anchors off the <guid>s: BBC News "republishes" stories by using
# guids with #0, #1, #2 etc, which results in duplicates in feed readers.
rss.css('guid').each { |guid| guid.content = guid.content.gsub(/#.*$/, '') }

# Now there might be duplicate <guid>s, which is usually harmless but isn't
# pretty (and violates the spec). Group the remaining items by guid in a single
# pass and keep only the first item of each group. Items without a <guid> are
# left alone, and a <guid> that sits outside any <item> can no longer crash the
# script (previously `[][1..-1]` yielded nil).
rss.css('item')
   .select { |item| item.at_css('guid') }
   .group_by { |item| item.css('guid').text }
   .each_value { |items| items.drop(1).each { |item| comment_out.call(item, 'DUPLICATE') } }

# Tag us as the generator, keeping a note of the original value. Skipped when
# the feed carries no <generator> element rather than raising NoMethodError.
generator = rss.at_css('generator')
if generator
  generator.content = "Dan Q's 'BBC News without the crap but with the sport' <https://danq.me/2024/03/09/bbc-news-without-the-crap/> generator. Was: #{generator.content}"
end

# Update the src to us:
rss.xpath('//atom:link').attr('href', '[URL TO WHERE YOU PLAN TO HOST THE FILE]')

# Write the filtered feed next to the script.
File.open('bbc-news-with-sport.xml', 'w') { |file| file.puts(rss.to_s) }
#!/usr/bin/env ruby
require 'bundler/inline'
# Dependencies:
# * open-uri - load remote URL content easily
# * nokogiri - parse/filter XML
gemfile do
source 'https://rubygems.org'
gem 'nokogiri'
end
require 'open-uri'
# Regular expressions describing the items to reject from the resulting RSS feed.
# GUIDs: drop everything from the "sport" section of the website, and also any
# iPlayer/Sounds/Ideas/videos/programmes links.
REJECT_GUIDS_MATCHING = %r{^https://www\.bbc\.(co\.uk|com)/(sport|iplayer|sounds|ideas|news/videos|programmes)/}
# Titles: drop any item whose title is exactly "BBC News app".
REJECT_TITLES_MATCHING = /^(BBC News app)$/

# Replaces an <item> with an XML comment holding its original markup, prefixed
# with a label saying why it was removed. "--" is not allowed inside an XML
# comment, so every occurrence in the item's markup is rewritten first.
comment_out = lambda do |item, label|
  item.swap("<!-- [#{label}] #{item.to_s.gsub(/--/, '[hyphen][hyphen]')} -->")
end

# Load the original RSS. URI.open is required here: since Ruby 3.0, Kernel#open
# no longer fetches URLs, even with open-uri loaded. The block form closes the
# handle as soon as parsing has finished.
rss = URI.open('https://feeds.bbci.co.uk/news/rss.xml?edition=uk') { |io| Nokogiri::XML(io) }

# Filter out every item whose guid or title matches one of the reject patterns.
rejected_items = rss.css('item').select do |item|
  item.css('guid').text.match?(REJECT_GUIDS_MATCHING) ||
    item.css('title').text.match?(REJECT_TITLES_MATCHING)
end
rejected_items.each { |item| comment_out.call(item, 'REJECTED') }

# Strip the anchors off the <guid>s: BBC News "republishes" stories by using
# guids with #0, #1, #2 etc, which results in duplicates in feed readers.
rss.css('guid').each { |guid| guid.content = guid.content.gsub(/#.*$/, '') }

# Now there might be duplicate <guid>s, which is usually harmless but isn't
# pretty (and violates the spec). Group the remaining items by guid in a single
# pass and keep only the first item of each group. Items without a <guid> are
# left alone, and a <guid> that sits outside any <item> can no longer crash the
# script (previously `[][1..-1]` yielded nil).
rss.css('item')
   .select { |item| item.at_css('guid') }
   .group_by { |item| item.css('guid').text }
   .each_value { |items| items.drop(1).each { |item| comment_out.call(item, 'DUPLICATE') } }

# Tag us as the generator, keeping a note of the original value. Skipped when
# the feed carries no <generator> element rather than raising NoMethodError.
# The URL no longer carries the stray space and invisible left-to-right mark.
generator = rss.at_css('generator')
if generator
  generator.content = "Dan Q's 'BBC News without the crap' <https://danq.me/2025/02/03/bbc-news-rss-improved/> generator. Was: #{generator.content}"
end

# Update the src to us:
rss.xpath('//atom:link').attr('href', '[URL TO WHERE YOU PLAN TO HOST THE FILE]')

# Write the filtered feed next to the script.
File.open('bbc-news-no-sport.xml', 'w') { |file| file.puts(rss.to_s) }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment