Last active
July 25, 2017 18:16
-
-
Save martyworm/1284a53079c4b37be73931fae1982f56 to your computer and use it in GitHub Desktop.
Fetches a number of article snippets from the NYTimes API and writes all of the words to a CSV file, one word per line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
from nytimesarticle import articleAPI | |
import csv | |
import sys | |
import re | |
import urllib.request | |
import time | |
import requests | |
import json | |
import math | |
def main():
    """Fetch NYT article snippets, clean them up, and write one word per line to test.csv.

    Side effects: makes NYT API requests (via search_nyt_year) and overwrites
    the local file 'test.csv' twice (raw snippets first, then cleaned words).
    """
    # Get article snippets from the NYTimes API based on the search parameters.
    stuff2017 = search_nyt_year("Trump", 2017)
    article_snippets = format_articles(stuff2017)

    # Write the raw snippets to a CSV file, one snippet per row.
    with open('test.csv', 'w') as testfile:
        writer = csv.writer(testfile)
        for article in article_snippets:
            writer.writerow([article["snippet"]])

    # Read the CSV back and normalize: lowercase, letters and whitespace only.
    # format_articles() encoded the snippets to bytes, so each row begins with
    # a b'...' repr — either b... or "b... depending on CSV quoting; strip
    # that prefix before filtering characters.
    # Collect pieces in a list and join once at the end instead of repeatedly
    # re-lowercasing and concatenating the whole accumulator (was quadratic).
    pieces = []
    with open('test.csv', 'r') as testreadfile:
        for line in testreadfile:
            if line.startswith('"b'):
                text = line[2:]
            elif line.startswith('b'):
                text = line[1:]
            else:
                continue
            pieces.append(''.join(ch.lower() for ch in text
                                  if ch.isalpha() or ch.isspace()))
    final_snippets = ''.join(pieces)
    # The byte-escaped curly quotes show up as "xexx" after filtering, so
    # remove that artifact string.
    final_snippets = final_snippets.replace("xexx", "")

    # Overwrite test.csv with the final formatted words, one per line.
    with open('test.csv', 'w') as finalfile:
        z = csv.writer(finalfile)
        final_snippets = final_snippets.replace(" ", "\n")
        final_snippets = final_snippets.replace(",", "")
        z.writerow([final_snippets])
#Search NYTimes and grab a bunch of messy data | |
def search_nyt_year(term, year, max_pages=3):
    """Search the NYT Article Search API and return raw article docs.

    Searches for `term` within a fixed two-day window (July 21-22) of `year`
    and collects the `docs` lists from up to `max_pages` result pages.

    Args:
        term: Search query string.
        year: Year used to build the begin/end dates (YYYYMMDD format).
        max_pages: Upper bound on pages fetched (default 3, matching the
            original testing limit; raise it to collect everything).

    Returns:
        A list of raw article doc dicts as returned by the API.
    """
    key = "APIKEY"  # NOTE: replace with a real NYT API key
    base_url = "http://api.nytimes.com/svc/search/v2/articlesearch"
    response_format = ".json"
    search_params = {
        "q": term,
        "apikey": key,
        "begin_date": str(year) + "0721",  # date must be in YYYYMMDD format
        "end_date": str(year) + "0722",
    }
    # Initial request just to learn how many hits/pages there are.
    r = requests.get(base_url + response_format, params=search_params)
    data = r.json()
    hits = data['response']['meta']['hits']
    print("number of hits: " + str(hits))
    # The API serves 10 docs per page.
    pages = int(math.ceil(hits / 10))
    all_docs = []
    # Never request pages past the actual result count — asking for pages
    # beyond `pages` returns no docs and caused KeyErrors previously.
    for page in range(min(pages, max_pages)):
        print("collecting page " + str(page))
        # Throttle requests to stay under the API rate limit.
        time.sleep(0.8)
        search_params['page'] = page
        r = requests.get(base_url + response_format, params=search_params)
        data = r.json()
        # Add this page's docs to the big list.
        all_docs.extend(data['response']['docs'])
    return all_docs
#format all the stuff we got into a nice little KEY: VALUE: dict | |
def format_articles(unformatted_docs):
    """Reduce raw NYT API docs to a list of small dicts with only the fields we use.

    Args:
        unformatted_docs: Raw article doc dicts from search_nyt_year().

    Returns:
        A list of dicts with keys: section, newsdesk, id, snippet, headline,
        date. Note snippet/headline are UTF-8 *bytes* — main() relies on the
        resulting b'...' prefix when it cleans the CSV, so keep the encode.
    """
    formatted = []
    for doc in unformatted_docs:
        formatted.append({
            'section': doc['section_name'],
            'newsdesk': doc['news_desk'],
            'id': doc['_id'],
            'snippet': doc['snippet'].encode("utf8"),
            'headline': doc['headline']['main'].encode("utf8"),
            'date': doc['pub_date'][0:10],  # cut the time of day, keep YYYY-MM-DD
        })
    return formatted
# Run the scrape-and-clean pipeline only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment