Last active
July 25, 2017 18:16
-
-
Save martyworm/1284a53079c4b37be73931fae1982f56 to your computer and use it in GitHub Desktop.
Fetches a number of article snippets from the NYTimes API and writes all of the words to a CSV file, one word per line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
from nytimesarticle import articleAPI | |
import csv | |
import sys | |
import re | |
import urllib.request | |
import time | |
import requests | |
import json | |
import math | |
def main():
    """Fetch NYT article snippets, clean them up, and write one word per line to test.csv.

    Side effects: makes NYT API requests (via search_nyt_year) and overwrites
    the local file 'test.csv' twice (raw snippets first, then cleaned words).
    """
    # Get article snippets from the NYTimes API based on the search parameters.
    stuff2017 = search_nyt_year("Trump", 2017)
    article_snippets = format_articles(stuff2017)

    # Write the raw snippets to a CSV file, one snippet per row.
    with open('test.csv', 'w') as testfile:
        writer = csv.writer(testfile)
        for article in article_snippets:
            writer.writerow([article["snippet"]])

    # Read the CSV back and normalize: lowercase, letters and whitespace only.
    # format_articles() encoded the snippets to bytes, so each row begins with
    # a b'...' repr — either b... or "b... depending on CSV quoting; strip
    # that prefix before filtering characters.
    # Collect pieces in a list and join once at the end instead of repeatedly
    # re-lowercasing and concatenating the whole accumulator (was quadratic).
    pieces = []
    with open('test.csv', 'r') as testreadfile:
        for line in testreadfile:
            if line.startswith('"b'):
                text = line[2:]
            elif line.startswith('b'):
                text = line[1:]
            else:
                continue
            pieces.append(''.join(ch.lower() for ch in text
                                  if ch.isalpha() or ch.isspace()))
    final_snippets = ''.join(pieces)
    # The byte-escaped curly quotes show up as "xexx" after filtering, so
    # remove that artifact string.
    final_snippets = final_snippets.replace("xexx", "")

    # Overwrite test.csv with the final formatted words, one per line.
    with open('test.csv', 'w') as finalfile:
        z = csv.writer(finalfile)
        final_snippets = final_snippets.replace(" ", "\n")
        final_snippets = final_snippets.replace(",", "")
        z.writerow([final_snippets])
#Search NYTimes and grab a bunch of messy data | |
def search_nyt_year(term, year, max_pages=3):
    """Search the NYT Article Search API and return raw article docs.

    Searches for `term` within a fixed two-day window (July 21-22) of `year`
    and collects the `docs` lists from up to `max_pages` result pages.

    Args:
        term: Search query string.
        year: Year used to build the begin/end dates (YYYYMMDD format).
        max_pages: Upper bound on pages fetched (default 3, matching the
            original testing limit; raise it to collect everything).

    Returns:
        A list of raw article doc dicts as returned by the API.
    """
    key = "APIKEY"  # NOTE: replace with a real NYT API key
    base_url = "http://api.nytimes.com/svc/search/v2/articlesearch"
    response_format = ".json"
    search_params = {
        "q": term,
        "apikey": key,
        "begin_date": str(year) + "0721",  # date must be in YYYYMMDD format
        "end_date": str(year) + "0722",
    }
    # Initial request just to learn how many hits/pages there are.
    r = requests.get(base_url + response_format, params=search_params)
    data = r.json()
    hits = data['response']['meta']['hits']
    print("number of hits: " + str(hits))
    # The API serves 10 docs per page.
    pages = int(math.ceil(hits / 10))
    all_docs = []
    # Never request pages past the actual result count — asking for pages
    # beyond `pages` returns no docs and caused KeyErrors previously.
    for page in range(min(pages, max_pages)):
        print("collecting page " + str(page))
        # Throttle requests to stay under the API rate limit.
        time.sleep(0.8)
        search_params['page'] = page
        r = requests.get(base_url + response_format, params=search_params)
        data = r.json()
        # Add this page's docs to the big list.
        all_docs.extend(data['response']['docs'])
    return all_docs
#format all the stuff we got into a nice little KEY: VALUE: dict | |
def format_articles(unformatted_docs):
    """Reduce raw NYT API docs to a list of small dicts with only the fields we use.

    Args:
        unformatted_docs: Raw article doc dicts from search_nyt_year().

    Returns:
        A list of dicts with keys: section, newsdesk, id, snippet, headline,
        date. Note snippet/headline are UTF-8 *bytes* — main() relies on the
        resulting b'...' prefix when it cleans the CSV, so keep the encode.
    """
    formatted = []
    for doc in unformatted_docs:
        formatted.append({
            'section': doc['section_name'],
            'newsdesk': doc['news_desk'],
            'id': doc['_id'],
            'snippet': doc['snippet'].encode("utf8"),
            'headline': doc['headline']['main'].encode("utf8"),
            'date': doc['pub_date'][0:10],  # cut the time of day, keep YYYY-MM-DD
        })
    return formatted
# Run the scrape-and-clean pipeline only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment