@martyworm
Last active July 25, 2017 18:16
Gets a batch of article snippets from the NYTimes Article Search API and writes all of the words to a CSV file, one word per line.
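# Usage sketch (assumes an NYT Article Search API key and the requests package are available;
# the filename below is hypothetical, since the gist doesn't name the file):
#   1. paste your API key into the `key` variable in search_nyt_year()
#   2. run: python nyt_snippets.py
#   3. the cleaned words end up in test.csv in the working directory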
from __future__ import division
from nytimesarticle import articleAPI
import csv
import sys
import re
import urllib.request
import time
import requests
import json
import math

def main():
    # Get article snippets from the NYTimes website based on the search parameters
    stuff2017 = search_nyt_year("Trump", 2017)
    article_snippets = format_articles(stuff2017)

    # Take the list of article dicts and write every snippet to a CSV file
    with open('test.csv', 'w') as testfile:
        f = csv.writer(testfile)
        for x in article_snippets:
            f.writerow([x["snippet"]])

    # Read the CSV file back and clean it up: lowercase everything and drop numbers and punctuation
    with open('test.csv', 'r') as testreadfile:
        final_snippets = ""
        for line in testreadfile:
            # Snippets were encoded to bytes, so each CSV line looks like b'...' (or "b'... when csv quotes it)
            if line.startswith('"b'):
                for char in line[2:]:  # iterate character by character
                    if char.isalpha() or char.isspace():
                        final_snippets += char.lower()
                # The bytes repr turns curly quotes/apostrophes into escapes like \xe2\x80\x99, which
                # survive the letter filtering above as the odd string "xexx", so strip it out
                final_snippets = final_snippets.replace("xexx", "")
            elif line.startswith("b"):
                for char in line[1:]:
                    if char.isalpha() or char.isspace():
                        final_snippets += char.lower()
                final_snippets = final_snippets.replace("xexx", "")

    # Write the final formatted words, one per line
    with open('test.csv', 'w') as finalfile:
        z = csv.writer(finalfile)
        final_snippets = final_snippets.replace(" ", "\n")
        final_snippets = final_snippets.replace(",", "")
        z.writerow([final_snippets])

# Search NYTimes and grab a bunch of messy data
def search_nyt_year(term, year):
    # set key
    key = "APIKEY"  # my specific API key
    # set base url
    base_url = "http://api.nytimes.com/svc/search/v2/articlesearch"
    # set response format
    response_format = ".json"
    # set search parameters (the Article Search API expects the key under "api-key")
    search_params = {"q": term,
                     "api-key": key,
                     "begin_date": str(year) + "0721",  # dates must be in YYYYMMDD format
                     "end_date": str(year) + "0722"}
    # make request
    r = requests.get(base_url + response_format, params=search_params)
    # convert the response to a dictionary
    data = json.loads(r.text)
    # get number of hits
    hits = data['response']['meta']['hits']
    print("number of hits: " + str(hits))
    # get number of pages (10 docs per page)
    pages = int(math.ceil(hits / 10))
    # make an empty list where we'll hold all of our docs for every page
    all_docs = []
    # now we're ready to loop through the pages
    for i in range(3):  # ideally this would be range(pages); using a small fixed count to get bugs out
        print("collecting page " + str(i))
        # pause between requests so rate limiting doesn't trigger KeyErrors
        time.sleep(0.8)
        # set the page parameter
        search_params['page'] = i
        # make request
        r = requests.get(base_url + response_format, params=search_params)
        # get the text and convert it to a dictionary
        data = json.loads(r.text)
        # get just the docs
        docs = data['response']['docs']
        # add those docs to the big list
        all_docs = all_docs + docs
    return all_docs
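
# search_nyt_year only relies on a small slice of the Article Search response; the JSON it
# parses looks roughly like this (field values here are illustrative, not real data):
#   {"response": {"meta": {"hits": 42, ...},
#                 "docs": [{"_id": "...", "snippet": "...", "headline": {"main": "..."}, ...}, ...]}}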

# format all the stuff we got into a nice little KEY: VALUE dict
def format_articles(unformatted_docs):
    formatted = []
    for i in unformatted_docs:
        dic = {}
        dic['section'] = i['section_name']
        dic['newsdesk'] = i['news_desk']
        dic['id'] = i['_id']
        dic['snippet'] = i['snippet'].encode("utf8")
        dic['headline'] = i['headline']['main'].encode("utf8")
        dic['date'] = i['pub_date'][0:10]  # cutting off the time of day
        formatted.append(dic)
    return formatted
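
# Each dict that format_articles returns looks roughly like this (values are made up for illustration;
# snippet and headline are bytes because they were encoded above):
#   {'section': 'Politics', 'newsdesk': 'Washington', 'id': '...',
#    'snippet': b'...', 'headline': b'...', 'date': '2017-07-21'}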

if __name__ == "__main__":
    main()
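
# Note: `re` is imported above but never used. The character-by-character filtering in main()
# could be collapsed into a couple of regex substitutions. A minimal, untested sketch
# (clean_snippet is a hypothetical helper, not part of the original script):
#
#   def clean_snippet(text):
#       text = text.lower()
#       text = re.sub(r"\\x[0-9a-f]{2}", " ", text)   # drop escape sequences like \xe2\x80\x99
#       return re.sub(r"[^a-z\s]", "", text)          # keep only letters and whitespace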