Created
August 10, 2015 23:57
-
-
Save clhenrick/e846432787a6a80065b9 to your computer and use it in GitHub Desktop.
node js script to parse csv data into json for the Woodrow Wilson Center's "Tracking the Energy Titans" data-viz
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Import modules */ | |
var csv = require('fast-csv'), | |
fs = require('fs'); | |
/* "global" variables */ | |
var inputFilePath = '../data/csv/', | |
writeFilePaths = ['../data/country-data.json','../data/wow-factor.json'] | |
rawData = {}, | |
outData = null, | |
wowData = null, | |
numFiles = null, | |
count = 0; | |
/* | |
Process: | |
1. read CSV files from inputFilePath | |
2. read JSON files from writeFilePath (poor variable naming, I know) | |
3. for each CSV parse data & assign values to JSON | |
4. write country level data & sources to ./country-data.json | |
5. write the wow factors to ./wow-factor.json | |
Notes: | |
- In parseData() once all the CSV input files have been read and parsed | |
the output files will be written to the directory containing this script. | |
- Optionally you may comment out the writeOutFile() calls and instead console.log | |
the data to inspect it. | |
- A helpful way to do this is on the command line is by piping the output: | |
node parse-data.js | python -m json.tool | less | |
*/ | |
// read the output JSON files to grab the data structure | |
function readOutFiles(filePathArray, callback){ | |
fs.readFile(filePathArray[0],function(err,data) { | |
if (err) { | |
return console.log('read file error: ', err); | |
} | |
outData = JSON.parse(data); | |
}); | |
fs.readFile(filePathArray[1],function(err,data) { | |
if (err) { | |
return console.log('read file error: ', err); | |
} | |
wowData = JSON.parse(data); | |
}); | |
if (callback && typeof callback === 'function') { | |
callback(); | |
} | |
} | |
// get it all going | |
function begin(){ | |
fs.readdir(inputFilePath, function(err, files){ | |
// returns an array of file names | |
numFiles = files.length; | |
for (var i = 0; i<files.length; i++) { | |
parseFile(files[i]); | |
} | |
}); | |
} | |
// iterate over the CSV files | |
// converts CSV data to multi-dimensional arrays | |
// stores temporarily in rawData | |
function parseFile(fileName) { | |
var chunk = []; | |
csv | |
.fromPath(inputFilePath + fileName) | |
.on('data', function(data, error){ | |
if (error) console.log('csv parse error: ', error); | |
// console.log(fileName, ': ', data); | |
chunk.push(data); | |
}) | |
.on('end', function(){ | |
rawData[fileName] = chunk; | |
count ++; | |
if (count === numFiles) { | |
// console.log('done: ', JSON.stringify(rawData), '\n'); | |
parseData(rawData); | |
} | |
}); | |
} | |
// parses multi-dimensional arrays from csv module | |
function parseData(parsedCSV) { | |
var count = 0; | |
// console.log('parsedCSV: ', parsedCSV); | |
for (var k in parsedCSV) { | |
var obj = parsedCSV[k]; | |
if (k === 'us_total.csv') { | |
formatData('us', 'total', obj); | |
} else if (k === 'canada_total.csv') { | |
formatData('ca', 'total', obj); | |
} else if (k === 'china_total.csv') { | |
formatData('zh', 'total', obj); | |
} else if (k === 'us_per_capita.csv') { | |
formatData('us', 'per_capita', obj); | |
} else if (k === 'canada_per_capita.csv') { | |
formatData('ca', 'per_capita', obj); | |
} else if (k === 'china_per_capita.csv') { | |
formatData('zh', 'per_capita', obj); | |
} else if (k === 'wow_factor.csv') { | |
formatWows(obj); | |
} else if (k==='sources.csv') { | |
formatSources(obj); | |
} | |
count +=1; | |
if (count===numFiles) { | |
/* | |
comment these two lines out to not write output files | |
*/ | |
writeOutFile("country-data.json", JSON.stringify(outData)); | |
writeOutFile("wow-factor.json", JSON.stringify(wowData)); | |
/* | |
uncomment these two lines to test output | |
*/ | |
// console.log(JSON.stringify(outData)); | |
// console.log(JSON.stringify(wowData)); | |
} | |
} | |
} | |
function iterateMultiArray(arr,x,y) { | |
// iterates over a multi-dimensional array | |
// returns an array of numeric values | |
var i = 0, l = arr.length, arrToReturn = []; | |
for (i; i<l; i++) { | |
if (i===y) { | |
for (var j=0; j<arr[i].length; j++) { | |
if (j>x) { | |
var tmp = arr[i][j] | |
arrToReturn.push(filterFloat(tmp)); | |
} | |
} | |
} | |
} | |
return mapData2Years(arrToReturn); | |
} | |
function mapData2Years(arr) { | |
// maps each value of an array to its corresponding year | |
// preferred data format for d3 | |
var years = [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014]; | |
var toReturn = []; | |
for (var i = 0; i < arr.length; i++) { | |
var obj = {}; | |
obj.val = arr[i]; | |
obj.year = years[i]; | |
// only add the data if it isn't null | |
if (obj.val >= 0) { | |
toReturn.push(obj); | |
} | |
} | |
return toReturn; | |
} | |
function iterateMultiArraySource(arr,x,y) { | |
// iterates over a multi-dimensional array | |
// returns an array of text values | |
var i = 0, l = arr.length, arrToReturn = []; | |
for (i; i<l; i++) { | |
if (i===y) { | |
for (var j=0; j<arr[i].length; j++) { | |
if (j>x) { | |
var tmp = arr[i][j] | |
arrToReturn.push(tmp); | |
} | |
} | |
} | |
} | |
return arrToReturn; | |
} | |
function iterateMultiArrayWow(arr,x,y) { | |
// iterates over a multi-dimensional array for Wow Factor data | |
// returns a string of text for the wow factor | |
var i = 0, l = arr.length, val; | |
for (i; i<l; i++) { | |
if (i === y) { | |
for (var j=0; j<arr[i].length; j++) { | |
if (j === x && arr[i][j].length > 0) { | |
val = arr[i][j]; | |
// remove any extra whitespace, tabs, newlines, etc. | |
val = val.replace(/\s\s+/g, ' '); | |
} else if (j === x && arr[i][j].length === 0) { | |
// write placeholder text in case value is empty | |
val = '*** place holder text ***'; | |
console.log('wow val: ', val); | |
} | |
} | |
} | |
} | |
return val; | |
} | |
// cast data type from string to a number type | |
// if no data the value will be NaN and output value in JSON will be null | |
function filterFloat(value) { | |
value = value.replace(/ /g,''); | |
if(/^(\-|\+)?([0-9]+(\.[0-9]+)?|Infinity)$/ | |
.test(value)) | |
return Number(value); | |
return NaN; | |
} | |
// takes input data and assigns it to country-data.json | |
function formatData(country, dataClass, arr) { | |
var index = 0; | |
if (country==="ca") { | |
index = 1; | |
} else if (country==="zh") { | |
index = 2; | |
} | |
// console.log(country, ' ', dataClass, ' test: ', iterateMultiArray(arr,2,21)); | |
outData[country].production.coal[dataClass] = iterateMultiArray(arr,2,2); | |
outData[country].production.gas[dataClass] = iterateMultiArray(arr,2,3); | |
outData[country].production.oil[dataClass] = iterateMultiArray(arr,2,4); | |
outData[country].production.nuclear[dataClass] = iterateMultiArray(arr,2,5); | |
outData[country].production.hydro[dataClass] = iterateMultiArray(arr,2,6); | |
outData[country].production.renewables[dataClass] = iterateMultiArray(arr,2,7); | |
outData[country].imports.coal[dataClass] = iterateMultiArray(arr,2,10); | |
outData[country].imports.gas[dataClass] = iterateMultiArray(arr,2, 11); | |
outData[country].imports.crude_oil[dataClass] = iterateMultiArray(arr,2,12); | |
outData[country].imports.oil_products[dataClass] = iterateMultiArray(arr,2,13); | |
outData[country].imports.electricity[dataClass] = iterateMultiArray(arr,2,14); | |
outData[country].exports.coal[dataClass] = iterateMultiArray(arr,2,17); | |
outData[country].exports.gas[dataClass] = iterateMultiArray(arr,2,18); | |
outData[country].exports.crude_oil[dataClass] = iterateMultiArray(arr,2,19); | |
outData[country].exports.oil_products[dataClass] = iterateMultiArray(arr,2,20); | |
outData[country].exports.electricity[dataClass] = iterateMultiArray(arr,2,21); | |
outData[country].consumption.power_sector[dataClass] = iterateMultiArray(arr,2,24); | |
outData[country].consumption.transport[dataClass] = iterateMultiArray(arr,2,25); | |
outData[country].consumption.industry[dataClass] = iterateMultiArray(arr,2,26); | |
outData[country].consumption.residential[dataClass] = iterateMultiArray(arr,2,27); | |
outData[country].consumption.commercial[dataClass] = iterateMultiArray(arr,2,28); | |
outData[country].impact.co2_emissions[dataClass] = iterateMultiArray(arr,2,31); | |
outData[country].impact.water_use[dataClass] = iterateMultiArray(arr,2,32); | |
outData[country].impact.thermal_power_sector_losses[dataClass] = iterateMultiArray(arr,2,33); | |
} | |
// writes the data sources to country-data.json | |
function formatSources(arr) { | |
var i = 0, l=arr.length, toWrite=null; | |
// var i corresponds to line # -1 in sources.csv | |
for (i; i<l; i++) { | |
toWrite = iterateMultiArraySource(arr,1,i); | |
switch(i) { | |
case 2: | |
outData['us'].production.coal.source = toWrite[2]; | |
outData['ca'].production.coal.source = toWrite[0]; | |
outData['zh'].production.coal.source = toWrite[1]; | |
case 3: | |
outData['us'].production.gas.source = toWrite[2]; | |
outData['ca'].production.gas.source = toWrite[0]; | |
outData['zh'].production.gas.source = toWrite[1]; | |
case 4: | |
outData['us'].production.oil.source = toWrite[2]; | |
outData['ca'].production.oil.source = toWrite[0]; | |
outData['zh'].production.oil.source = toWrite[1]; | |
case 5: | |
outData['us'].production.nuclear.source = toWrite[2]; | |
outData['ca'].production.nuclear.source = toWrite[0]; | |
outData['zh'].production.nuclear.source = toWrite[1]; | |
case 6: | |
outData['us'].production.hydro.source = toWrite[2]; | |
outData['ca'].production.hydro.source = toWrite[0]; | |
outData['zh'].production.hydro.source = toWrite[1]; | |
case 7: | |
outData['us'].production.renewables.source = toWrite[2]; | |
outData['ca'].production.renewables.source = toWrite[0]; | |
outData['zh'].production.renewables.source = toWrite[1]; | |
case 10: | |
outData['us'].imports.coal.source = toWrite[2]; | |
outData['ca'].imports.coal.source = toWrite[0]; | |
outData['zh'].imports.coal.source = toWrite[1]; | |
case 11: | |
outData['us'].imports.gas.source = toWrite[2]; | |
outData['ca'].imports.gas.source = toWrite[0]; | |
outData['zh'].imports.gas.source = toWrite[1]; | |
case 12: | |
outData['us'].imports.crude_oil.source = toWrite[2]; | |
outData['ca'].imports.crude_oil.source = toWrite[0]; | |
outData['zh'].imports.crude_oil.source = toWrite[1]; | |
case 13: | |
outData['us'].imports.oil_products.source = toWrite[2]; | |
outData['ca'].imports.oil_products.source = toWrite[0]; | |
outData['zh'].imports.oil_products.source = toWrite[1]; | |
case 14: | |
outData['us'].imports.electricity.source = toWrite[2]; | |
outData['ca'].imports.electricity.source = toWrite[0]; | |
outData['zh'].imports.electricity.source = toWrite[1]; | |
case 17: | |
outData['us'].exports.coal.source = toWrite[2]; | |
outData['ca'].exports.coal.source = toWrite[0]; | |
outData['zh'].exports.coal.source = toWrite[1]; | |
case 18: | |
outData['us'].exports.gas.source = toWrite[2]; | |
outData['ca'].exports.gas.source = toWrite[0]; | |
outData['zh'].exports.gas.source = toWrite[1]; | |
case 19: | |
outData['us'].exports.crude_oil.source = toWrite[2]; | |
outData['ca'].exports.crude_oil.source = toWrite[0]; | |
outData['zh'].exports.crude_oil.source = toWrite[1]; | |
case 20: | |
outData['us'].exports.oil_products.source = toWrite[2]; | |
outData['ca'].exports.oil_products.source = toWrite[0]; | |
outData['zh'].exports.oil_products.source = toWrite[1]; | |
case 21: | |
outData['us'].exports.electricity.source = toWrite[2]; | |
outData['ca'].exports.electricity.source = toWrite[0]; | |
outData['zh'].exports.electricity.source = toWrite[1]; | |
case 24: | |
outData['us'].consumption.power_sector.source = toWrite[2]; | |
outData['ca'].consumption.power_sector.source = toWrite[0]; | |
outData['zh'].consumption.power_sector.source = toWrite[1]; | |
case 25: | |
outData['us'].consumption.transport.source = toWrite[2]; | |
outData['ca'].consumption.transport.source = toWrite[0]; | |
outData['zh'].consumption.transport.source = toWrite[1]; | |
case 26: | |
outData['us'].consumption.industry.source = toWrite[2]; | |
outData['ca'].consumption.industry.source = toWrite[0]; | |
outData['zh'].consumption.industry.source = toWrite[1]; | |
case 27: | |
outData['us'].consumption.residential.source = toWrite[2]; | |
outData['ca'].consumption.residential.source = toWrite[0]; | |
outData['zh'].consumption.residential.source = toWrite[1]; | |
case 28: | |
outData['us'].consumption.commercial.source = toWrite[2]; | |
outData['ca'].consumption.commercial.source = toWrite[0]; | |
outData['zh'].consumption.commercial.source = toWrite[1]; | |
case 31: | |
outData['us'].impact.co2_emissions.source = toWrite[2]; | |
outData['ca'].impact.co2_emissions.source = toWrite[0]; | |
outData['zh'].impact.co2_emissions.source = toWrite[1]; | |
case 32: | |
outData['us'].impact.water_use.source = toWrite[2]; | |
outData['ca'].impact.water_use.source = toWrite[0]; | |
outData['zh'].impact.water_use.source = toWrite[1]; | |
case 33: | |
outData['us'].impact.thermal_power_sector_losses.source = toWrite[2]; | |
outData['ca'].impact.thermal_power_sector_losses.source = toWrite[0]; | |
outData['zh'].impact.thermal_power_sector_losses.source = toWrite[1]; | |
} | |
} | |
} | |
// filters wow factor arrays for per capita & total data | |
function formatWows(arr) { | |
writeWows(arr, 'total', 2); | |
writeWows(arr, 'per_capita', 3); | |
} | |
// writes wow factors to wow-factor.json | |
function writeWows(arr, cat, x) { | |
// console.log(cat, ': ', iterateMultiArrayWow(arr,x,3)); | |
wowData[cat].production.coal = iterateMultiArrayWow(arr,x,2); | |
wowData[cat].production.gas = iterateMultiArrayWow(arr,x,3); | |
wowData[cat].production.oil = iterateMultiArrayWow(arr,x,4); | |
wowData[cat].production.nuclear = iterateMultiArrayWow(arr,x,5); | |
wowData[cat].production.hydro = iterateMultiArrayWow(arr,x,6); | |
wowData[cat].production.renewables = iterateMultiArrayWow(arr,x,7); | |
wowData[cat].imports.coal = iterateMultiArrayWow(arr,x,10); | |
wowData[cat].imports.gas = iterateMultiArrayWow(arr,x, 11); | |
wowData[cat].imports.crude_oil = iterateMultiArrayWow(arr,x,12); | |
wowData[cat].imports.oil_products = iterateMultiArrayWow(arr,x,13); | |
wowData[cat].imports.electricity = iterateMultiArrayWow(arr,x,14); | |
wowData[cat].exports.coal = iterateMultiArrayWow(arr,x,17); | |
wowData[cat].exports.gas = iterateMultiArrayWow(arr,x,18); | |
wowData[cat].exports.crude_oil = iterateMultiArrayWow(arr,x,19); | |
wowData[cat].exports.oil_products = iterateMultiArrayWow(arr,x,20); | |
wowData[cat].exports.electricity = iterateMultiArrayWow(arr,x,21); | |
wowData[cat].consumption.power_sector = iterateMultiArrayWow(arr,x,24); | |
wowData[cat].consumption.transport = iterateMultiArrayWow(arr,x,25); | |
wowData[cat].consumption.industry = iterateMultiArrayWow(arr,x,26); | |
wowData[cat].consumption.residential = iterateMultiArrayWow(arr,x,27); | |
wowData[cat].consumption.commercial = iterateMultiArrayWow(arr,x,28); | |
wowData[cat].impact.co2_emissions = iterateMultiArrayWow(arr,x,31); | |
wowData[cat].impact.thermal_power_sector_losses = iterateMultiArrayWow(arr,x,32); | |
wowData[cat].impact.water_use = iterateMultiArrayWow(arr,x,33); | |
} | |
// writes the final JSON files to stdout | |
function writeOutFile(filePath, data) { | |
fs.writeFile(filePath, data, function(err) { | |
if (err) return console.log('write error: ', err); | |
console.log("sucess!"); | |
}); | |
} | |
// starts the script | |
readOutFiles(writeFilePaths, begin); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment