Skip to content

Instantly share code, notes, and snippets.

@jczaplew
Created May 30, 2013 15:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jczaplew/5678587 to your computer and use it in GitHub Desktop.
Save jczaplew/5678587 to your computer and use it in GitHub Desktop.
EIA Nuclear Powerplant data scraper
## Script to scrape all nuclear powerplant data from the EIA database
## and create a CSV
## Requires an EIA API key, which you can register for at http://www.eia.gov/beta/api/register.cfm
## John J Czaplewski | jczaplew@gmail.com | May, 2013
import urllib2
import json
import csv
# Scrape annual nuclear generation series for every powerplant in the EIA
# category tree and write one CSV row per series to results.csv.

# Every request is authenticated with the same EIA API key.
API_KEY = 'BA406DCCD54844844DC10C573C176695'
CATEGORY_URL = 'http://api.eia.gov/category/?api_key=' + API_KEY + '&category_id='
SERIES_URL = 'http://api.eia.gov/series/?series_id='

# Category whose child categories are the individual powerplants
ROOT_CATEGORY_ID = 1018

# Years exported as data_<year> columns, newest first ("2012" ... "2001")
YEARS = [str(year) for year in range(2012, 2000, -1)]


def _fetchJson(url):
    """Download `url` and return its body parsed as JSON.

    The HTTP response is closed even when parsing raises.
    """
    response = urllib2.urlopen(url)
    try:
        return json.load(response)
    finally:
        response.close()


def _isAnnualNuclear(series):
    """Return True when a child series is annual ("f" == "A") nuclear data.

    The fuel type is the third " : "-separated segment of the series name;
    names with fewer segments cannot be nuclear series and are rejected
    instead of raising IndexError.
    """
    nameParts = series["name"].split(" : ")
    return len(nameParts) > 2 and nameParts[2] == "Nuclear" and series["f"] == "A"


def _valuesByYear(data, years):
    """Return one value per entry of `years`, taken from `data` by year label.

    `data` is a sequence of rows whose first item is the year label and
    whose second item is the value. Looking values up by label (rather than
    by position) keeps the columns aligned when a series starts at a
    different year or has gaps. Years without a row yield "0".
    """
    lookup = {}
    for row in data:
        # Rows too short to carry a value are treated as missing.
        if len(row) >= 2:
            lookup[row[0]] = row[1]
    return [lookup.get(year, "0") for year in years]


# Create the CSV file and writer; the with-block guarantees the file is
# flushed and closed even if a request fails part-way through the scrape.
with open('results.csv', 'wb+') as csvFile:
    csvWriter = csv.writer(csvFile)
    csvWriter.writerow(
        ['name', 'fuel', 'generator', 'series_id', 'lat', 'lon', 'source']
        + ['data_' + year for year in YEARS]
    )

    # First query finds the IDs of all the available powerplants
    powerplants = _fetchJson(CATEGORY_URL + str(ROOT_CATEGORY_ID))
    powerplantIds = [
        child["category_id"]
        for child in powerplants["category"]["childcategories"]
    ]

    # Walk every powerplant; the counter starts at 1 so the progress message
    # reports how many plants are actually finished.
    for finished, powerplantId in enumerate(powerplantIds, 1):
        # For each powerplant, find all available categories of data and
        # keep only the annual series for the nuclear fuel source
        powerplantNetGen = _fetchJson(CATEGORY_URL + str(powerplantId))
        keepers = [
            childSeries["series_id"]
            for childSeries in powerplantNetGen["category"]["childseries"]
            if _isAnnualNuclear(childSeries)
        ]

        # Request the detailed data of every series that was kept
        for seriesId in keepers:
            stationData = _fetchJson(SERIES_URL + seriesId + '&api_key=' + API_KEY)
            series = stationData["series"][0]

            # Strip apostrophes, then split the name into its segments;
            # segments 1-3 fill the name, fuel and generator columns.
            nameSanitized = series["name"].replace("'", "").split(" : ")
            yearData = _valuesByYear(series["data"], YEARS)

            # Finally write a new row to the CSV
            csvWriter.writerow(
                [
                    nameSanitized[1],
                    nameSanitized[2],
                    nameSanitized[3],
                    series["series_id"],
                    series["lat"],
                    series["lon"],
                    series["source"],
                ]
                + yearData
            )
            print("Inserted a row")

        # Now do it again, and again
        print("Done with " + str(finished) + " of " + str(len(powerplantIds)))

print("Finished!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment