@jasonsnell
Last active May 20, 2023 02:08
Yet Another Web to RSS Scraper
#! /usr/bin/env python3
import os
import requests
from bs4 import BeautifulSoup
import hashlib
import PyRSS2Gen
import datetime
import html.entities
from urllib.parse import urljoin
import json
from collections import deque
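# The imports above mix standard-library modules with three third-party
# packages (requests, BeautifulSoup, PyRSS2Gen). Assuming a stock Python 3
# with pip available, one way to install the non-standard ones is:
#   python3 -m pip install requests beautifulsoup4 PyRSS2Gen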
# Define the URL of the webpage to load
url = 'https://www.sfchronicle.com/sports/'
filepath = '/path/to/your/webserver/'
slug = 'sportinggreen'
feedTitle = 'Sporting Green - SF Chronicle'
links = set()
# Load previously saved links from file
try:
    with open(f'{filepath}{slug}.json', 'r') as f:
        json_links = json.load(f)
    # The JSON file holds a list of [href, title, timestamp] entries
    links_set = set(map(tuple, json_links))
    for item in links_set:
        links.add(item)
    print('Found ' + str(len(links)) + ' previous items.')
except FileNotFoundError:
    print('no data file found.')
# Get current time
current_time = datetime.datetime.utcnow()
dupes = 0
nopes = 0
# Load the webpage content
response = requests.get(url)
html_str = response.content.decode('utf-8') # Convert bytes to string
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html_str, 'html.parser')
# Extract the title of the webpage
webpage_title = soup.title.string.strip()
# Extract the links and their linked text from the webpage
# This is SF Chronicle specific, you will need to adjust
for link in soup.find_all('a', class_='hdn-analytics'):
    href = link.get('href')
    # this is a Chronicle-specific URL path, you may need to rejigger
    if href and '/article/' in href:
        href = urljoin(url, href)  # Prepend the domain to relative links
        if link.string:
            # Check whether this URL already appears in any saved (href, title, timestamp) tuple
            is_present = any(href in sublist for sublist in links)
            if is_present:
                dupes += 1
            else:
                nopes += 1
                title = link.string.strip()
                title = html.escape(title, quote=True)  # Escape special characters
                title = title.encode('ascii', 'xmlcharrefreplace').decode()  # Convert 8-bit characters to HTML entities
                timestamp = current_time.isoformat()
                links.add((href, title, timestamp))
print('Found ' + str(nopes) + ' new items and ' + str(dupes) + " duplicates.")
# Save unique links to file
links_list = sorted(links, key=lambda item: item[2])  # sort by timestamp so truncation drops the oldest entries
with open(f'{filepath}{slug}.json', 'w') as f:
    json.dump(links_list[-250:], f)  # keep at most the 250 most recent links
# Generate an RSS feed from the links
rss_items = []
for link, title, date in links:
    rss_item = PyRSS2Gen.RSSItem(
        title=title,
        link=link,
        guid=link,
        pubDate=datetime.datetime.fromisoformat(date)  # parse the stored ISO timestamp so PyRSS2Gen emits an RFC 822 pubDate
    )
    rss_items.append(rss_item)
rss_feed = PyRSS2Gen.RSS2(
    title=feedTitle,
    link=url,
    description='RSS feed of the unique links on {}'.format(url),
    lastBuildDate=datetime.datetime.now(),
    items=rss_items,
)
# Save the RSS feed to a file
filename = f'{filepath}{slug}.rss'
with open(filename, 'w', encoding='utf-8') as f:
    rss_feed.write_xml(f)
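The extraction loop above is tied to the Chronicle's markup (the hdn-analytics link class and the /article/ URL path). As a rough sketch of what adapting it to another site involves, the fragment below swaps in a hypothetical section URL, link class, and path filter; all three are placeholders, not values taken from any real site.

#!/usr/bin/env python3
# Hypothetical adaptation sketch: base_url, the 'headline-link' class, and the
# '/story/' path filter are placeholders to replace with the target site's values.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base_url = 'https://example.com/news/'
response = requests.get(base_url)
soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

for link in soup.find_all('a', class_='headline-link'):
    href = link.get('href')
    if href and '/story/' in href:
        # Print the absolute article URL and its link text
        print(urljoin(base_url, href), link.get_text(strip=True))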