# johnowhitaker/make_rss.py

## make_rss.py
import trafilatura
import feedparser
import requests
from bs4 import BeautifulSoup
from feedgenerator import DefaultFeed, Enclosure


# Hugging Face Inference API endpoint for the facebook/bart-large-cnn model.
API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
# Authorization header sent by query() with every request.
# NOTE(review): "HF_TOKEN" is a literal placeholder here, not a real token --
# presumably it is substituted before the script runs; confirm.
headers = {"Authorization": "Bearer HF_TOKEN"}

def query(payload, timeout=60):
    """POST *payload* as JSON to the inference endpoint and return the decoded reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole feed build.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

def summarize(text):
    """Return a summary of *text* obtained through ``query``.

    Args:
        text: Article text to summarise, or ``None``.

    Returns:
        The ``summary_text`` field of the first result, or ``None`` when
        *text* is ``None``.

    Raises:
        KeyError, IndexError, TypeError: If the reply is not a non-empty list
            of result objects (for example when the service answers with an
            error object instead).
    """
    if text is None:
        return None
    output = query({
        "inputs": text,  # TODO pick max size
        # Generation settings belong under "parameters"; as top-level keys
        # next to "inputs" they are not applied by the Inference API.
        "parameters": {
            "max_length": 300,
            "min_length": 30,
            "do_sample": False,
        },
    })
    return output[0]['summary_text']

# Source feed: the Hacker News front page as served by hnrss.org.
HN_Feed = feedparser.parse('https://hnrss.org/frontpage')
# Previously written output file, parsed but not used further below.
Prev_Feed = feedparser.parse('feed.xml')
Out_Feed = DefaultFeed(
    title="DistilHN Feed",
    link="http://example.com/rss",
    description="Front Page articles from HN, sumarized with AI"
)

for p in HN_Feed.entries:

    print(p['title'])

    # Default image: the HN favicon for on-site posts, a placeholder otherwise.
    im_url = 'https://news.ycombinator.com/favicon.ico'
    if 'ycombinator' not in p['link']:
        im_url = 'https://placekitten.com/g/300/200'

    # Summarize
    summary = p['summary']
    if 'Article URL' in summary:
        # Link post: fetch the article and summarise its text. The failure
        # message is set up front so it is kept whenever no text is extracted.
        summary = "Summary failed. Article URL: " + p['link']
        try:
            downloaded = trafilatura.fetch_url(p['link'])
            text = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
            if text is not None:

                # Get summary:
                summary = summarize(text)

                # Get image URL; keep the default when the og:image tag is
                # missing or has no usable "content" attribute, instead of
                # raising and discarding the summary obtained above.
                soup = BeautifulSoup(downloaded, 'html.parser')
                im = soup.find("meta", property="og:image")
                if im is not None and im.get('content'):
                    im_url = im['content']
        except Exception as err:
            # Best effort: one bad article must not abort the whole feed, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print("Summary failed for", p['link'], "-", err)
            summary = "Summary failed. Article URL: " + p['link']
            im_url = 'None'
    else:
        # Truncate & Remove HTML (for askHN and similar)
        if len(summary) > 10:
            cleaned = trafilatura.extract(summary)
            # extract() returns None when it finds no text; keep the original
            # summary in that case rather than crashing on None[:240].
            if cleaned is not None:
                summary = cleaned[:240] + '...'

    # Special rules

    # YouTube
    if 'youtube' in p['link']:
        summary = "YouTube Video: " + p['link']
        im_url = 'None'

    # Mastodon
    # NOTE(review): the summary check looks for the misspelling 'mastadon';
    # confirm whether 'mastodon' was intended before changing it.
    if 'mastodon' in p['link'] or 'mastadon' in p['summary']:
        summary = "Mastadon Post: " + p['link']
        im_url = 'None'

    # Twitter
    if 'twitter' in p['link']:
        summary = "Twitter Post: " + p['link']
        im_url = 'None'

    # Add to feed
    Out_Feed.add_item(
        title=p['title'],
        link=p['link'],
        description=summary,
        comments=p['comments'],
        enclosure=Enclosure(im_url, '1234', 'image/jpeg'),
    )

# Generate the RSS feed XML
rss = Out_Feed.writeString('utf-8')

# Save the RSS feed to a file. The encoding is explicit so the bytes on disk
# match the utf-8 encoding passed to writeString, whatever the platform default.
with open('feed.xml', 'w', encoding='utf-8') as f:
    f.write(rss)
	import trafilatura
	import feedparser
	import requests
	from bs4 import BeautifulSoup
	from feedgenerator import DefaultFeed, Enclosure


	# Hugging Face Inference API endpoint for the facebook/bart-large-cnn model.
	API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
	# Authorization header sent by query() with every request.
	# NOTE(review): "HF_TOKEN" is a literal placeholder here, not a real token --
	# presumably it is substituted before the script runs; confirm.
	headers = {"Authorization": "Bearer HF_TOKEN"}

	def query(payload, timeout=60):
	    """POST *payload* as JSON to the inference endpoint and return the decoded reply.

	    Args:
	        payload: JSON-serialisable request body.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout ``requests`` waits indefinitely, so a single stalled
	            connection would hang the whole feed build.

	    Returns:
	        The decoded JSON body of the response.

	    Raises:
	        requests.exceptions.Timeout: If the server does not answer in time.
	    """
	    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
	    return response.json()

	def summarize(text):
	    """Return a summary of *text* obtained through ``query``.

	    Args:
	        text: Article text to summarise, or ``None``.

	    Returns:
	        The ``summary_text`` field of the first result, or ``None`` when
	        *text* is ``None``.

	    Raises:
	        KeyError, IndexError, TypeError: If the reply is not a non-empty list
	            of result objects (for example when the service answers with an
	            error object instead).
	    """
	    if text is None:
	        return None
	    output = query({
	        "inputs": text,  # TODO pick max size
	        # Generation settings belong under "parameters"; as top-level keys
	        # next to "inputs" they are not applied by the Inference API.
	        "parameters": {
	            "max_length": 300,
	            "min_length": 30,
	            "do_sample": False,
	        },
	    })
	    return output[0]['summary_text']

	# Source feed: the Hacker News front page as served by hnrss.org.
	HN_Feed = feedparser.parse('https://hnrss.org/frontpage')
	# Previously written output file, parsed but not used further below.
	Prev_Feed = feedparser.parse('feed.xml')
	Out_Feed = DefaultFeed(
	    title="DistilHN Feed",
	    link="http://example.com/rss",
	    description="Front Page articles from HN, sumarized with AI"
	)

	for p in HN_Feed.entries:

	    print(p['title'])

	    # Default image: the HN favicon for on-site posts, a placeholder otherwise.
	    im_url = 'https://news.ycombinator.com/favicon.ico'
	    if 'ycombinator' not in p['link']:
	        im_url = 'https://placekitten.com/g/300/200'

	    # Summarize
	    summary = p['summary']
	    if 'Article URL' in summary:
	        # Link post: fetch the article and summarise its text. The failure
	        # message is set up front so it is kept whenever no text is extracted.
	        summary = "Summary failed. Article URL: " + p['link']
	        try:
	            downloaded = trafilatura.fetch_url(p['link'])
	            text = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
	            if text is not None:

	                # Get summary:
	                summary = summarize(text)

	                # Get image URL; keep the default when the og:image tag is
	                # missing or has no usable "content" attribute, instead of
	                # raising and discarding the summary obtained above.
	                soup = BeautifulSoup(downloaded, 'html.parser')
	                im = soup.find("meta", property="og:image")
	                if im is not None and im.get('content'):
	                    im_url = im['content']
	        except Exception as err:
	            # Best effort: one bad article must not abort the whole feed, but
	            # KeyboardInterrupt/SystemExit are no longer swallowed.
	            print("Summary failed for", p['link'], "-", err)
	            summary = "Summary failed. Article URL: " + p['link']
	            im_url = 'None'
	    else:
	        # Truncate & Remove HTML (for askHN and similar)
	        if len(summary) > 10:
	            cleaned = trafilatura.extract(summary)
	            # extract() returns None when it finds no text; keep the original
	            # summary in that case rather than crashing on None[:240].
	            if cleaned is not None:
	                summary = cleaned[:240] + '...'

	    # Special rules

	    # YouTube
	    if 'youtube' in p['link']:
	        summary = "YouTube Video: " + p['link']
	        im_url = 'None'

	    # Mastodon
	    # NOTE(review): the summary check looks for the misspelling 'mastadon';
	    # confirm whether 'mastodon' was intended before changing it.
	    if 'mastodon' in p['link'] or 'mastadon' in p['summary']:
	        summary = "Mastadon Post: " + p['link']
	        im_url = 'None'

	    # Twitter
	    if 'twitter' in p['link']:
	        summary = "Twitter Post: " + p['link']
	        im_url = 'None'

	    # Add to feed
	    Out_Feed.add_item(
	        title=p['title'],
	        link=p['link'],
	        description=summary,
	        comments=p['comments'],
	        enclosure=Enclosure(im_url, '1234', 'image/jpeg'),
	    )

	# Generate the RSS feed XML
	rss = Out_Feed.writeString('utf-8')

	# Save the RSS feed to a file. The encoding is explicit so the bytes on disk
	# match the utf-8 encoding passed to writeString, whatever the platform default.
	with open('feed.xml', 'w', encoding='utf-8') as f:
	    f.write(rss)