Last active
June 17, 2017 04:31
-
-
Save looselycoupled/fe2f22979dde458a7c56 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Model answer for RSS exercise
NOTE: In order to use this program, be sure to create a sub-directory
called 'articles'
"""
########################################################################## | |
## Imports | |
########################################################################## | |
import os
import re
import unicodedata

import feedparser
import requests
########################################################################## | |
## Module Variables/Constants | |
########################################################################## | |
# Feed whose entries are fetched and saved by main()
RSS_URL = 'http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml'
########################################################################## | |
## Functions | |
########################################################################## | |
def slugify(value):
    """
    Convert a text string into a lowercase ASCII slug.

    The text is first decomposed with Unicode NFKD normalization so that
    accented letters keep their ASCII base letter instead of vanishing;
    whatever is still outside ASCII is then dropped. Characters that aren't
    alphanumerics, underscores, whitespace, or hyphens are removed, leading
    and trailing whitespace is stripped, the result is lowercased, and every
    run of whitespace and/or hyphens is collapsed into a single hyphen.

    Returns an empty string when no slug characters remain.

    Note: This is not production code
    """
    # decompose first so the ASCII step keeps base letters of accented chars
    value = unicodedata.normalize('NFKD', value)
    value = value.encode('ascii', 'ignore').decode('ascii')
    # raw strings: \w and \s are regex escapes, not string escapes
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
def save_article(title, content):
    """
    Save HTML content using a slugged version of the title as the basis for
    the filename.

    The file is written as UTF-8 to '<current working directory>/articles/'
    and is named '<slugified title>.html'. The 'articles' directory is
    created if it does not exist yet. An existing file with the same name is
    overwritten.
    """
    directory = os.path.join(os.getcwd(), 'articles')
    # create the target directory on demand rather than failing on open()
    if not os.path.isdir(directory):
        os.makedirs(directory)

    filename = '%s.html' % slugify(title)
    path = os.path.join(directory, filename)

    # the context manager closes the handle even if the write raises
    with open(path, 'wb') as f:
        f.write(content.encode('utf-8'))
def main():
    """
    Main execution

    Parses the RSS feed at RSS_URL, requests each entry's URL, and saves
    every successfully retrieved page with save_article. An entry that
    cannot be retrieved (HTTP error status, network failure, timeout, or an
    unusable URL) is reported on stdout and skipped so the remaining entries
    are still processed.
    """
    # seconds to wait on a single request; without it a stalled server
    # would block the whole run indefinitely
    timeout = 30

    # grab RSS data and parse it
    feed = feedparser.parse(RSS_URL)

    # loop through each article/RSS item
    for entry in feed.entries:
        # for clarity, assign the entry ID (used here as the article url)
        # into a new variable
        url = entry['id']

        # fetch article using url; a failed request is reported the same
        # way as a bad HTTP status instead of aborting the loop
        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            print('Could not retrieve file: {}'.format(url))
            continue

        # save to disk or print an error message
        if r.ok:
            save_article(entry['title'], r.text)
        else:
            print('Could not retrieve file: {}'.format(url))
########################################################################## | |
## Execution | |
########################################################################## | |
# run only when executed as a script, not when imported as a module
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment