Created March 18, 2023 19:11
-
-
Save looselycoupled/54c313e75d2966e3c2e2f2d1e42e62ec to your computer and use it in GitHub Desktop.
rss solution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Template for RSS exercise | |
""" | |
########################################################################## | |
## Imports | |
########################################################################## | |
import os | |
import re | |
from pprint import pprint | |
import requests | |
import feedparser | |
########################################################################## | |
## Module Variables/Constants | |
########################################################################## | |
# Address of the RSS feed that main() parses to find the articles to download.
RSS_URL = "http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"
########################################################################## | |
## Functions | |
########################################################################## | |
def slugify(value):
    """
    Convert text into a form that is safe to use as a filename.

    Accented characters are decomposed (NFKD) so their ASCII base letters
    survive, any remaining non-ASCII characters are dropped, characters that
    aren't alphanumerics, underscores, hyphens or whitespace are removed,
    the result is stripped of leading/trailing whitespace and lowercased,
    and every run of whitespace and/or hyphens becomes a single hyphen.

    Args:
        value: The text to convert (a ``str``).

    Returns:
        The slug as a ``str``. It is empty when no usable characters remain.
    """
    # Imported locally so the function does not depend on a change to the
    # module-level import block.
    import unicodedata

    # Without this step "é" would be discarded outright by the ASCII encode
    # below; after NFKD it is "e" plus a combining accent, and only the
    # accent is discarded.
    value = unicodedata.normalize('NFKD', value)
    value = value.encode('ascii', 'ignore').decode('ascii')

    # Raw strings: "\w" and "\s" are regex escapes, not string escapes.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
def save_article(title, content):
    """
    Save HTML content in the ``articles`` directory, using a slugged version
    of the title as the basis for the filename.

    Args:
        title: Article title; passed through ``slugify`` to build the name.
        content: The text to write to the file.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    # A title with no sluggable characters would otherwise produce the
    # hidden file ".html".
    filename = (slugify(title) or "untitled") + ".html"

    # Create the output directory on first use instead of failing with
    # FileNotFoundError.
    os.makedirs("articles", exist_ok=True)

    # Explicit UTF-8 so the write does not depend on the platform's default
    # encoding, which may be unable to represent characters in the page.
    with open(os.path.join("articles", filename), "w", encoding="utf-8") as f:
        f.write(content)
def main():
    """
    Main execution: download every article listed in the RSS feed.

    Parses the feed at ``RSS_URL``, then for each entry fetches the page at
    the entry's ``link`` and stores the response body via ``save_article``
    under the entry's ``title``. A failure to fetch or save one article is
    reported with ``print`` and the loop moves on to the next entry.
    """
    # grab RSS data and parse it
    feed = feedparser.parse(RSS_URL)

    # loop through each article/RSS item
    for entry in feed["entries"]:
        title = entry["title"]

        try:
            response = requests.get(
                entry["link"],
                # The exercise notes call for this header on article requests.
                headers={"user-agent": "curl/7.86.0"},
                # requests waits indefinitely unless a timeout is given.
                timeout=30,
            )
            # Turn 4xx/5xx responses into exceptions so error pages are not
            # saved as if they were articles.
            response.raise_for_status()
        except requests.RequestException as err:
            print(f"Could not fetch {title!r}: {err}")
            continue

        # save to disk or print an error message
        try:
            save_article(title, response.text)
        except OSError as err:
            print(f"Could not save {title!r}: {err}")
########################################################################## | |
## Execution | |
########################################################################## | |
# Start the download only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.