Skip to content

Instantly share code, notes, and snippets.

@lowweihong
Created April 10, 2020 06:40
Show Gist options
  • Save lowweihong/94f6cf51c91c65eed83a0f56a68a19a7 to your computer and use it in GitHub Desktop.
Save lowweihong/94f6cf51c91c65eed83a0f56a68a19a7 to your computer and use it in GitHub Desktop.
import feedparser
from pprint import pprint
from bs4 import BeautifulSoup
# Google News search feed for the query "covid-19", requested as RSS.
# The literal is split for readability; the pieces concatenate to one string.
url = (
    "http://news.google.com/news"
    "?q=covid-19&hl=en-US&sort=date&gl=US&num=100&output=rss"
)
class ParseFeed:
    """Download a feed and pretty-print the details of each entry."""

    def __init__(self, url):
        """Remember where the feed lives.

        Args:
            url: Feed address handed to ``feedparser.parse`` by :meth:`parse`.
        """
        self.feed_url = url

    def clean(self, html):
        """Get the text from html and do some cleaning.

        Args:
            html: Markup string, e.g. the description of a feed entry.

        Returns:
            The text content of ``html`` with every non-breaking space
            (U+00A0) replaced by a regular space.
        """
        # Name the parser explicitly: a bare ``BeautifulSoup(html)`` picks
        # whichever parser happens to be installed, emits a
        # GuessedAtParserWarning, and can yield different results on
        # different machines. "html.parser" ships with Python itself.
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text()
        return text.replace('\xa0', ' ')

    def parse(self):
        """Parse the URL, and print all the details of the news.

        Each entry is pretty-printed as a dict with the keys
        ``Description``, ``Published Date``, ``Title`` and ``Url``; a field
        missing from an entry is shown as an empty string.
        """
        entries = feedparser.parse(self.feed_url).entries
        for entry in entries:
            pprint({
                'Description': self.clean(entry.get("description", "")),
                'Published Date': entry.get("published", ""),
                'Title': entry.get("title", ""),
                'Url': entry.get("link", ""),
            })
# Script entry: build a parser for the module-level ``url`` and print every
# entry of that feed. There is no ``__main__`` guard, so this also runs when
# the module is imported.
feed = ParseFeed(url)
feed.parse()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment