Last active
August 29, 2015 14:15
-
-
Save Andrew62/ff0dde78d8146882704c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 14 08:45:03 2015

@author: andrew

Data Sources

Grab the NYT RSS feed.
Identify the articles you want.
Write the articles out as UTF-8.

Allen's answer: https://gist.github.com/looselycoupled/fe2f22979dde458a7c56
"""
import re | |
import requests | |
import feedparser as fp | |
def slugify(value):
    """
    Turn arbitrary text into a lowercase, hyphen-separated ASCII slug.

    Steps, in order:
      1. Drop every character that cannot be encoded as ASCII.
      2. Remove anything that is not a word character (alphanumeric or
         underscore), whitespace, or a hyphen.
      3. Strip leading/trailing whitespace and lowercase the result.
      4. Collapse each run of whitespace and/or hyphens into one hyphen.

    Note: This is not production code

    value -- the text to convert
    returns -- the slug; empty when no character survives the filtering
    """
    ascii_only = value.encode('ascii', 'ignore').decode('ascii')
    # Raw strings keep the regex escapes out of Python's own string-escape
    # processing, which otherwise warns about unknown escape sequences.
    cleaned = re.sub(r'[^\w\s-]', '', ascii_only).strip().lower()
    return re.sub(r'[-\s]+', '-', cleaned)
def get_items(url):
    """
    Parse the RSS feed at `url` and lazily yield one dict per entry.

    Each yielded dict has exactly two keys, "title" and "link", copied
    from the corresponding feed entry.
    """
    feed = fp.parse(url)
    entries = feed["entries"]
    for entry in entries:
        article = dict(title=entry["title"], link=entry["link"])
        yield article
def write_article(title, link):
    """
    Download an article and save its body as a UTF-8 encoded HTML file.

    The file is named after the slugified title with an ".html" extension
    and is opened by that relative name. When the request does not succeed
    (response.ok is false) nothing is written; the title and the link are
    printed instead.

    title -- article title, used to build the output file name
    link -- URL of the article to fetch
    """
    response = requests.get(link)
    if not response.ok:
        # Single-argument parenthesised print behaves the same on
        # Python 2 and Python 3.
        print("Could not retrieve article {0}".format(title))
        print(link)
        return

    file_name = "{0}.html".format(slugify(title))
    # The payload is explicitly encoded to bytes, so the file must be opened
    # in binary mode: text mode rejects bytes on Python 3 and applies newline
    # translation on Windows under Python 2.
    with open(file_name, "wb") as target:
        target.write(response.text.encode("utf-8"))
def main():
    """
    Fetch the NYT home page RSS feed and save every article it lists.

    Each article's title is printed just before its page is downloaded.
    """
    nyt_rss = "http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"
    # Iterate the generator directly; building an intermediate list adds
    # nothing because the items are consumed exactly once, in order.
    for item in get_items(nyt_rss):
        # Single-argument parenthesised print behaves the same on
        # Python 2 and Python 3.
        print(item["title"])
        write_article(item["title"], item["link"])


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment