@jasonsnell
Last active May 20, 2023 02:08
Yet Another Web to RSS Scraper
#! /usr/bin/env python3
import os
import requests
from bs4 import BeautifulSoup
import hashlib
import PyRSS2Gen
import datetime
import html.entities
from urllib.parse import urljoin
import json
from collections import deque
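# The imports above mix standard-library modules with three third-party
# packages (requests, BeautifulSoup, PyRSS2Gen). Assuming a stock Python 3
# with pip available, one way to install the non-standard ones is:
#   python3 -m pip install requests beautifulsoup4 PyRSS2Gen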
# Define the URL of the webpage to load
url = 'https://www.sfchronicle.com/sports/'
filepath = '/path/to/your/webserver/'
slug = 'sportinggreen'
feedTitle = 'Sporting Green - SF Chronicle'
links = set()
# Load previously saved links from file
try:
    with open(f'{filepath}{slug}.json', 'r') as f:
        json_links = json.load(f)
    # The JSON file holds a list of [href, title, timestamp] entries
    links_set = set(map(tuple, json_links))
    for item in links_set:
        links.add(item)
    print('Found ' + str(len(links)) + ' previous items.')
except FileNotFoundError:
    print('no data file found.')
# Get current time
current_time = datetime.datetime.utcnow()
dupes = 0
nopes = 0
# Load the webpage content
response = requests.get(url)
html_str = response.content.decode('utf-8') # Convert bytes to string
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html_str, 'html.parser')
# Extract the title of the webpage
webpage_title = soup.title.string.strip()
# Extract the links and their linked text from the webpage
# This is SF Chronicle specific, you will need to adjust
for link in soup.find_all('a', class_='hdn-analytics'):
    href = link.get('href')
    # this is a Chronicle-specific URL path, you may need to rejigger
    if href and '/article/' in href:
        href = urljoin(url, href)  # Prepend the domain to relative links
        if link.string:
            # Check whether this URL already appears in any saved (href, title, timestamp) tuple
            is_present = any(href in sublist for sublist in links)
            if is_present:
                dupes += 1
            else:
                nopes += 1
                title = link.string.strip()
                title = html.escape(title, quote=True)  # Escape special characters
                title = title.encode('ascii', 'xmlcharrefreplace').decode()  # Convert 8-bit characters to HTML entities
                timestamp = current_time.isoformat()
                links.add((href, title, timestamp))
print('Found ' + str(nopes) + ' new items and ' + str(dupes) + " duplicates.")
# Save unique links to file
links_list = sorted(links, key=lambda item: item[2])  # sort by timestamp so truncation drops the oldest entries
with open(f'{filepath}{slug}.json', 'w') as f:
    json.dump(links_list[-250:], f)  # keep at most the 250 most recent links
# Generate an RSS feed from the links
rss_items = []
for link, title, date in links:
    rss_item = PyRSS2Gen.RSSItem(
        title=title,
        link=link,
        guid=link,
        pubDate=datetime.datetime.fromisoformat(date)  # parse the stored ISO timestamp so PyRSS2Gen emits an RFC 822 pubDate
    )
    rss_items.append(rss_item)
rss_feed = PyRSS2Gen.RSS2(
    title=feedTitle,
    link=url,
    description='RSS feed of the unique links on {}'.format(url),
    lastBuildDate=datetime.datetime.now(),
    items=rss_items,
)
# Save the RSS feed to a file
filename = f'{filepath}{slug}.rss'
with open(filename, 'w', encoding='utf-8') as f:
    rss_feed.write_xml(f)
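The extraction loop above is tied to the Chronicle's markup (the hdn-analytics link class and the /article/ URL path). As a rough sketch of what adapting it to another site involves, the fragment below swaps in a hypothetical section URL, link class, and path filter; all three are placeholders, not values taken from any real site.

#!/usr/bin/env python3
# Hypothetical adaptation sketch: base_url, the 'headline-link' class, and the
# '/story/' path filter are placeholders to replace with the target site's values.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base_url = 'https://example.com/news/'
response = requests.get(base_url)
soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

for link in soup.find_all('a', class_='headline-link'):
    href = link.get('href')
    if href and '/story/' in href:
        # Print the absolute article URL and its link text
        print(urljoin(base_url, href), link.get_text(strip=True))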