'''
A generalized script to gather data from
DOE's College Scorecard site using JSON
USAGE
python collegescorecard_v2.py 'https://api.data.gov/ed/collegescorecard/v1/schools.json?school.ownership=2&fields=id,school.name,2014.student.share_firstgeneration&page=0&per_page=100&api_key=MY_API_KEY_HIDDEN'
'''
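#For reference, the script below assumes the API response is shaped
#roughly like the following (an illustrative sketch inferred from the
#keys read in scrape_function; the values shown are hypothetical):
#
#    {
#        "metadata": {"total": 4000, "page": 0, "per_page": 100},
#        "results": [
#            {"id": 123456,
#             "school.name": "Example College",
#             "2014.student.share_firstgeneration": 0.42},
#            ...
#        ]
#    }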
#import libraries
#the time library allows us to pause between requests
from time import sleep
#these libraries expand the functionality
#of your existing code
import csv, json, re, urllib2, math, argparse

#main scraper function
#accepts an API query string as a parameter
def scrape_function(query):
    #find where the field list starts in the query string
    start = query.index('fields=') + len('fields=')
    #find where the field list ends in the query string
    end = query.index('&', start)
    #store the fields from the query in a header variable
    header = query[start:end].split(',')
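    #as a worked example: for the query shown in the USAGE note above,
    #the slice between 'fields=' and the next '&' is
    #'id,school.name,2014.student.share_firstgeneration', so header
    #becomes ['id', 'school.name', '2014.student.share_firstgeneration']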
    #open a csv file and define the headers
    with open('scorecard-data.csv','wb') as f:
        #whenever we want to write to our csv,
        #we'll call 'writer' and tell it what to do
        writer = csv.writer(f)
        #write the header to the first row in our csv
        writer.writerow(header)
        #report back to us if it's successful
        print 'New file created...'
        #load initial json_data
        #report that we're loading the data
        print 'Loading JSON data...'
        #use urllib2 library to open the url
        #and store the contents into a file
        json_file = urllib2.urlopen(query).read()
        #use json library to parse the file
        #into a structured format readable in python
        json_data = json.loads(json_file)
        #get the number of entries in the data
        #which we can think of as rows in a spreadsheet
        total_rows = json_data['metadata']['total']
        per_page = json_data['metadata']['per_page']
        current_page = json_data['metadata']['page']
        #calculate the number of pages we'll need
        #to go through to capture all the data
        last_page = int(math.ceil(total_rows/float(per_page))) - 1
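        #as an illustration with hypothetical numbers: if total_rows were
        #4000 and per_page were 100, last_page would be
        #int(math.ceil(4000/100.0)) - 1 = 39, i.e. pages 0 through 39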
        print 'Metadata captured...'
        #loop through the pages and write the data to your csv
        while current_page <= last_page:
            #report that we're loading the data (again)
            print 'Loading JSON data...'
            #use urllib2 library to open the url
            #and store the contents into a file (again)
            json_file = urllib2.urlopen(query).read()
            #use json library to parse the file
            #into a structured format readable in python (again)
            json_data = json.loads(json_file)
            #loop through the results on each page and write each to a row
            for result in json_data['results']:
                #create a blank list called 'row'
                row = []
                #loop through the header list and use it to match
                #each field in the data to the field in the spreadsheet
                for field in header:
                    #add each datapoint in the proper order to our row list
                    row.append(result[field])
                #write the row to our spreadsheet
                writer.writerow(row)
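                #for instance, with the fields from the USAGE query, a
                #single (hypothetical) result such as
                #{'id': 123456, 'school.name': 'Example College',
                # '2014.student.share_firstgeneration': 0.42}
                #would be written out as the row
                #[123456, 'Example College', 0.42]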
            #report back to us
            print "Page", current_page, "written to CSV..."
            #increment the page counter
            current_page += 1
            #queue up the next page of the query: strip any existing
            #&page parameter, then append the new page number so the
            #query doesn't accumulate duplicate page parameters
            query = re.sub(r'&page=\d+', '', query) + '&page=' + str(current_page)
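            #to illustrate with the USAGE query: on the first pass its
            #'&page=0' is stripped and the query now ends in '&page=1',
            #on the next pass '&page=2', and so on up to last_page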
            #report that we're done with this step
            #and we're taking a short break
            print 'Sleeping...'
            #sleep for 3 seconds, because we want to be
            #sensitive to the computing demands of automated
            #scraping of data (we're so nice!)
            sleep(3)

#this is what executes when you run collegescorecard_v2.py
if __name__ == '__main__':
    #use a built-in parser to accept the query as a command-line argument
    parser = argparse.ArgumentParser(description='Download the JSON data from CollegeScorecard and save as CSV')
    parser.add_argument('query', help='Enter a valid API query')
    args = parser.parse_args()
    #run our main scraping function, feeding in a query
    scrape_function(args.query)
    print '...done'
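#when run with a valid query, the console output should look roughly
#like the following (page numbers and count will vary with the query):
#
#    New file created...
#    Loading JSON data...
#    Metadata captured...
#    Loading JSON data...
#    Page 0 written to CSV...
#    Sleeping...
#    ...
#    ...done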