Skip to content

Instantly share code, notes, and snippets.

@johnowhitaker
Created December 30, 2022 18:11
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johnowhitaker/741e9c9f16f7eb9c879bc0dbcd780b59 to your computer and use it in GitHub Desktop.
Save johnowhitaker/741e9c9f16f7eb9c879bc0dbcd780b59 to your computer and use it in GitHub Desktop.
App to summarize the articles in an RSS feed and write the summaries to a new RSS feed
import trafilatura
import feedparser
import requests
from bs4 import BeautifulSoup
from feedgenerator import DefaultFeed, Enclosure
# Hugging Face Inference API endpoint for the facebook/bart-large-cnn model.
API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"

# HTTP headers sent with every request to API_URL.
# NOTE(review): "HF_TOKEN" looks like a placeholder for a real access token -
# confirm it is substituted before the script is run.
headers = dict(Authorization="Bearer HF_TOKEN")
def query(payload, timeout=60):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serializable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole feed build.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.post(
        API_URL,
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    return response.json()
def summarize(text):
    """Return a summary of ``text`` produced by the model behind ``query``.

    Args:
        text: The article text to summarize, or ``None``.

    Returns:
        The ``summary_text`` field of the first result, or ``None`` when
        ``text`` is ``None`` (no request is made in that case).

    Raises:
        KeyError, IndexError, TypeError: If the reply does not have the
            expected ``[{"summary_text": ...}]`` shape (for example when the
            service answers with an error object instead of a result list).
    """
    if text is None:
        return None
    output = query({
        "inputs": text,  # TODO pick max size
        # Generation options belong in the nested "parameters" object; the
        # Inference API does not read them from the top level of the payload.
        "parameters": {
            "max_length": 300,
            "min_length": 30,
            "do_sample": False,
        },
    })
    return output[0]['summary_text']
# Source feed: the Hacker News front page.
HN_Feed = feedparser.parse('https://hnrss.org/frontpage')
# NOTE(review): the previously written feed is parsed here but never used
# below - presumably intended for caching earlier summaries; confirm.
Prev_Feed = feedparser.parse('feed.xml')
Out_Feed = DefaultFeed(
    title="DistilHN Feed",
    link="http://example.com/rss",
    description="Front Page articles from HN, summarized with AI"
)

for p in HN_Feed.entries:
    print(p['title'])

    # Default image: HN favicon for ycombinator links, a stock image otherwise.
    im_url = 'https://news.ycombinator.com/favicon.ico'
    if 'ycombinator' not in p['link']:
        im_url = 'https://placekitten.com/g/300/200'

    # Summarize
    summary = p['summary']
    if 'Article URL' in summary:
        # Link post: download the article, summarize it and look for an image.
        try:
            # Used as-is when no article text can be extracted.
            summary = "Summary failed. Article URL: " + p['link']
            downloaded = trafilatura.fetch_url(p['link'])
            text = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
            if text is not None:
                # Get summary:
                summary = summarize(text)
            # Get image URL
            soup = BeautifulSoup(downloaded, 'html.parser')
            im = soup.find("meta", property="og:image")
            # A meta tag without usable content must not discard the summary.
            if im and im.get('content'):
                im_url = im['content']
        except Exception:
            # Best effort: any download/parse/API failure degrades to a
            # placeholder entry instead of aborting the whole feed.
            summary = "Summary failed. Article URL: " + p['link']
            im_url = 'None'
    else:
        # Truncate & Remove HTML (for askHN and similar)
        if len(summary) > 10:
            plain = trafilatura.extract(summary)
            if plain is None:
                # extract() can return None when it finds no main content;
                # fall back to a plain tag-strip so slicing cannot crash.
                plain = BeautifulSoup(summary, 'html.parser').get_text(' ', strip=True)
            # Only mark the text as shortened when it really was.
            summary = plain[:240] + '...' if len(plain) > 240 else plain

    # Special rules
    # YouTube
    if 'youtube' in p['link']:
        summary = "YouTube Video: " + p['link']
        im_url = 'None'
    # Mastodon
    # NOTE(review): the summary check uses the misspelling 'mastadon' and so
    # will rarely match; left unchanged because correcting it would also
    # relabel text posts that merely mention Mastodon - confirm intent.
    if 'mastodon' in p['link'] or 'mastadon' in p.summary:
        summary = "Mastodon Post: " + p['link']
        im_url = 'None'
    # Twitter
    if 'twitter' in p['link']:
        summary = "Twitter Post: " + p['link']
        im_url = 'None'

    # Add to feed
    Out_Feed.add_item(
        title=p['title'],
        link=p['link'],
        description=summary,
        # .get() so an entry without a comments link does not abort the run.
        comments=p.get('comments'),
        # The length ('1234') and MIME type are fixed placeholder values.
        enclosure=Enclosure(im_url, '1234', 'image/jpeg'),
    )

# Generate the RSS feed XML
rss = Out_Feed.writeString('utf-8')

# Save the RSS feed to a file. The encoding is explicit so the bytes on disk
# match the utf-8 encoding the feed was generated with, whatever the locale.
with open('feed.xml', 'w', encoding='utf-8') as f:
    f.write(rss)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment