Skip to content

Instantly share code, notes, and snippets.

@Andrew62
Last active August 29, 2015 14:15
Show Gist options
  • Save Andrew62/ff0dde78d8146882704c to your computer and use it in GitHub Desktop.
Save Andrew62/ff0dde78d8146882704c to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 14 08:45:03 2015
@author: andrew
Data Sources
Grab the NYT RSS feed.
Identify the articles you want.
Write the articles out as UTF-8.
Allen's answer: https://gist.github.com/looselycoupled/fe2f22979dde458a7c56
"""
import re
import requests
import feedparser as fp
def slugify(value):
    """
    Build a filename/URL-friendly slug from ``value``.

    Drops non-ASCII characters, removes every character that is not a
    word character (alphanumeric or underscore), whitespace, or a hyphen,
    strips leading and trailing whitespace, lowercases the result, and
    collapses each run of whitespace and/or hyphens into a single hyphen.

    Note: This is not production code
    """
    # Round-trip through ASCII with 'ignore' to discard non-ASCII characters.
    value = value.encode('ascii', 'ignore').decode('ascii')
    # Raw strings keep the regex escapes (\w, \s) out of Python's own
    # string-escape processing, which otherwise warns on newer interpreters.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
def get_items(url):
    """
    Parse the RSS feed at ``url`` and yield one dict per feed entry.

    Each yielded dict has two keys: "title" (the entry's title) and
    "link" (the entry's link).
    """
    feed = fp.parse(url)
    for entry in feed["entries"]:
        title, link = entry["title"], entry["link"]
        yield dict(title=title, link=link)
def write_article(title, link):
    """
    Download ``link`` and save the response body as a UTF-8 HTML file.

    The file is named ``<slugified title>.html`` and is opened relative to
    the current working directory. If the response is not OK, nothing is
    written; a message containing the title, followed by the link, is
    printed instead.
    """
    response = requests.get(link)
    if not response.ok:
        # Single-argument parenthesized print works on Python 2 and 3 alike.
        print("Could not retrieve article {0}".format(title))
        print(link)
        return

    file_name = "{0}.html".format(slugify(title))
    # Binary mode: the payload is already-encoded UTF-8 bytes, so it must
    # bypass the text layer (which would reject bytes on Python 3 and may
    # translate newlines on some platforms).
    with open(file_name, "wb") as target:
        target.write(response.text.encode("utf-8"))
def main():
    """
    Fetch the NYT home page RSS feed and save every article it lists.

    Prints each article title before attempting to download it.
    """
    nyt_rss = "http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"
    # Iterate the generator directly; an intermediate list is not needed
    # because the items are consumed exactly once, in order.
    for item in get_items(nyt_rss):
        # Single-argument parenthesized print works on Python 2 and 3 alike.
        print(item["title"])
        write_article(item["title"], item["link"])


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment