Skip to content

Instantly share code, notes, and snippets.

@looselycoupled
Last active June 17, 2017 04:31
Show Gist options
  • Save looselycoupled/fe2f22979dde458a7c56 to your computer and use it in GitHub Desktop.
Save looselycoupled/fe2f22979dde458a7c56 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
Model answer for RSS exercise
NOTE: Articles are saved to a sub-directory called 'articles' in the current
working directory; be sure that directory exists before running this program.
"""
##########################################################################
## Imports
##########################################################################
import os
import re
import requests
import feedparser
##########################################################################
## Module Variables/Constants
##########################################################################
RSS_URL = 'http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml'
##########################################################################
## Functions
##########################################################################
def slugify(value):
    """
    Convert a string into a lowercase, hyphen-separated ASCII slug.

    Characters with no ASCII representation are dropped, then anything that
    is not an alphanumeric, underscore, whitespace, or hyphen is removed.
    Leading and trailing whitespace is stripped, the text is lowercased, and
    every run of whitespace and/or hyphens collapses into a single hyphen.

    Note: This is not production code

    :param value: text to convert (e.g. an article title)
    :return: the slug; an empty string if no character survives filtering
    """
    # Drop every character that cannot be encoded as ASCII.
    value = value.encode('ascii', 'ignore').decode('ascii')
    # Raw strings keep \w and \s from being parsed as (invalid) string
    # escapes before the pattern ever reaches the regex engine.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
def save_article(title, content):
    """
    Save HTML content using a slugged version of the title as the basis for
    the filename.

    The file is written UTF-8 encoded to ``articles/<slug>.html`` under the
    current working directory. The ``articles`` directory is created when it
    does not exist yet, and an existing file with the same name is
    overwritten.

    :param title: article title; passed through slugify() to build the name
    :param content: article HTML as text
    """
    directory = os.path.join(os.getcwd(), 'articles')
    # Create the target directory on demand instead of failing on open().
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, '%s.html' % slugify(title))
    # The context manager closes the file even if encoding or writing fails.
    with open(path, 'wb') as f:
        f.write(content.encode('utf-8'))
def main():
    """
    Main execution: download every article listed in the RSS feed.

    Parses the feed at RSS_URL, fetches each entry's page over HTTP and
    hands the title and body to save_article(). An article that cannot be
    retrieved (network error, timeout, or a non-OK HTTP status) is reported
    on stdout and skipped so the remaining entries are still processed.
    """
    # grab RSS data and parse it
    feed = feedparser.parse(RSS_URL)

    # loop through each article/RSS item
    for entry in feed.entries:
        # the entry ID is used as the article url for this feed
        url = entry['id']

        # fetch article using url; without a timeout a stalled server
        # would block the script indefinitely
        try:
            r = requests.get(url, timeout=30)
        except requests.RequestException as err:
            # one unreachable article should not abort the whole run
            print('Could not retrieve file: {} ({})'.format(url, err))
            continue

        # save to disk or print an error message
        if r.ok:
            save_article(entry['title'], r.text)
        else:
            print('Could not retrieve file: {}'.format(url))
##########################################################################
## Execution
##########################################################################
# Run the download only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment