Skip to content

Instantly share code, notes, and snippets.

@clhenrick
Created August 10, 2015 23:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save clhenrick/e846432787a6a80065b9 to your computer and use it in GitHub Desktop.
Save clhenrick/e846432787a6a80065b9 to your computer and use it in GitHub Desktop.
node js script to parse csv data into json for the Woodrow Wilson Center's "Tracking the Energy Titans" data-viz
/* Import modules */
var csv = require('fast-csv'),
fs = require('fs');
/* "global" variables */
var inputFilePath = '../data/csv/',
writeFilePaths = ['../data/country-data.json','../data/wow-factor.json']
rawData = {},
outData = null,
wowData = null,
numFiles = null,
count = 0;
/*
Process:
1. read CSV files from inputFilePath
2. read JSON files from writeFilePath (poor variable naming, I know)
3. for each CSV parse data & assign values to JSON
4. write country level data & sources to ./country-data.json
5. write the wow factors to ./wow-factor.json
Notes:
- In parseData() once all the CSV input files have been read and parsed
the output files will be written to the directory containing this script.
- Optionally you may comment out the writeOutFile() calls and instead console.log
the data to inspect it.
- A helpful way to do this is on the command line is by piping the output:
node parse-data.js | python -m json.tool | less
*/
// read the output JSON files to grab the data structure
function readOutFiles(filePathArray, callback){
fs.readFile(filePathArray[0],function(err,data) {
if (err) {
return console.log('read file error: ', err);
}
outData = JSON.parse(data);
});
fs.readFile(filePathArray[1],function(err,data) {
if (err) {
return console.log('read file error: ', err);
}
wowData = JSON.parse(data);
});
if (callback && typeof callback === 'function') {
callback();
}
}
// get it all going
function begin(){
fs.readdir(inputFilePath, function(err, files){
// returns an array of file names
numFiles = files.length;
for (var i = 0; i<files.length; i++) {
parseFile(files[i]);
}
});
}
// iterate over the CSV files
// converts CSV data to multi-dimensional arrays
// stores temporarily in rawData
function parseFile(fileName) {
var chunk = [];
csv
.fromPath(inputFilePath + fileName)
.on('data', function(data, error){
if (error) console.log('csv parse error: ', error);
// console.log(fileName, ': ', data);
chunk.push(data);
})
.on('end', function(){
rawData[fileName] = chunk;
count ++;
if (count === numFiles) {
// console.log('done: ', JSON.stringify(rawData), '\n');
parseData(rawData);
}
});
}
// parses multi-dimensional arrays from csv module
function parseData(parsedCSV) {
var count = 0;
// console.log('parsedCSV: ', parsedCSV);
for (var k in parsedCSV) {
var obj = parsedCSV[k];
if (k === 'us_total.csv') {
formatData('us', 'total', obj);
} else if (k === 'canada_total.csv') {
formatData('ca', 'total', obj);
} else if (k === 'china_total.csv') {
formatData('zh', 'total', obj);
} else if (k === 'us_per_capita.csv') {
formatData('us', 'per_capita', obj);
} else if (k === 'canada_per_capita.csv') {
formatData('ca', 'per_capita', obj);
} else if (k === 'china_per_capita.csv') {
formatData('zh', 'per_capita', obj);
} else if (k === 'wow_factor.csv') {
formatWows(obj);
} else if (k==='sources.csv') {
formatSources(obj);
}
count +=1;
if (count===numFiles) {
/*
comment these two lines out to not write output files
*/
writeOutFile("country-data.json", JSON.stringify(outData));
writeOutFile("wow-factor.json", JSON.stringify(wowData));
/*
uncomment these two lines to test output
*/
// console.log(JSON.stringify(outData));
// console.log(JSON.stringify(wowData));
}
}
}
function iterateMultiArray(arr,x,y) {
// iterates over a multi-dimensional array
// returns an array of numeric values
var i = 0, l = arr.length, arrToReturn = [];
for (i; i<l; i++) {
if (i===y) {
for (var j=0; j<arr[i].length; j++) {
if (j>x) {
var tmp = arr[i][j]
arrToReturn.push(filterFloat(tmp));
}
}
}
}
return mapData2Years(arrToReturn);
}
function mapData2Years(arr) {
// maps each value of an array to its corresponding year
// preferred data format for d3
var years = [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014];
var toReturn = [];
for (var i = 0; i < arr.length; i++) {
var obj = {};
obj.val = arr[i];
obj.year = years[i];
// only add the data if it isn't null
if (obj.val >= 0) {
toReturn.push(obj);
}
}
return toReturn;
}
function iterateMultiArraySource(arr,x,y) {
// iterates over a multi-dimensional array
// returns an array of text values
var i = 0, l = arr.length, arrToReturn = [];
for (i; i<l; i++) {
if (i===y) {
for (var j=0; j<arr[i].length; j++) {
if (j>x) {
var tmp = arr[i][j]
arrToReturn.push(tmp);
}
}
}
}
return arrToReturn;
}
function iterateMultiArrayWow(arr,x,y) {
// iterates over a multi-dimensional array for Wow Factor data
// returns a string of text for the wow factor
var i = 0, l = arr.length, val;
for (i; i<l; i++) {
if (i === y) {
for (var j=0; j<arr[i].length; j++) {
if (j === x && arr[i][j].length > 0) {
val = arr[i][j];
// remove any extra whitespace, tabs, newlines, etc.
val = val.replace(/\s\s+/g, ' ');
} else if (j === x && arr[i][j].length === 0) {
// write placeholder text in case value is empty
val = '*** place holder text ***';
console.log('wow val: ', val);
}
}
}
}
return val;
}
// cast data type from string to a number type
// if no data the value will be NaN and output value in JSON will be null
function filterFloat(value) {
value = value.replace(/ /g,'');
if(/^(\-|\+)?([0-9]+(\.[0-9]+)?|Infinity)$/
.test(value))
return Number(value);
return NaN;
}
// takes input data and assigns it to country-data.json
function formatData(country, dataClass, arr) {
var index = 0;
if (country==="ca") {
index = 1;
} else if (country==="zh") {
index = 2;
}
// console.log(country, ' ', dataClass, ' test: ', iterateMultiArray(arr,2,21));
outData[country].production.coal[dataClass] = iterateMultiArray(arr,2,2);
outData[country].production.gas[dataClass] = iterateMultiArray(arr,2,3);
outData[country].production.oil[dataClass] = iterateMultiArray(arr,2,4);
outData[country].production.nuclear[dataClass] = iterateMultiArray(arr,2,5);
outData[country].production.hydro[dataClass] = iterateMultiArray(arr,2,6);
outData[country].production.renewables[dataClass] = iterateMultiArray(arr,2,7);
outData[country].imports.coal[dataClass] = iterateMultiArray(arr,2,10);
outData[country].imports.gas[dataClass] = iterateMultiArray(arr,2, 11);
outData[country].imports.crude_oil[dataClass] = iterateMultiArray(arr,2,12);
outData[country].imports.oil_products[dataClass] = iterateMultiArray(arr,2,13);
outData[country].imports.electricity[dataClass] = iterateMultiArray(arr,2,14);
outData[country].exports.coal[dataClass] = iterateMultiArray(arr,2,17);
outData[country].exports.gas[dataClass] = iterateMultiArray(arr,2,18);
outData[country].exports.crude_oil[dataClass] = iterateMultiArray(arr,2,19);
outData[country].exports.oil_products[dataClass] = iterateMultiArray(arr,2,20);
outData[country].exports.electricity[dataClass] = iterateMultiArray(arr,2,21);
outData[country].consumption.power_sector[dataClass] = iterateMultiArray(arr,2,24);
outData[country].consumption.transport[dataClass] = iterateMultiArray(arr,2,25);
outData[country].consumption.industry[dataClass] = iterateMultiArray(arr,2,26);
outData[country].consumption.residential[dataClass] = iterateMultiArray(arr,2,27);
outData[country].consumption.commercial[dataClass] = iterateMultiArray(arr,2,28);
outData[country].impact.co2_emissions[dataClass] = iterateMultiArray(arr,2,31);
outData[country].impact.water_use[dataClass] = iterateMultiArray(arr,2,32);
outData[country].impact.thermal_power_sector_losses[dataClass] = iterateMultiArray(arr,2,33);
}
// writes the data sources to country-data.json
function formatSources(arr) {
var i = 0, l=arr.length, toWrite=null;
// var i corresponds to line # -1 in sources.csv
for (i; i<l; i++) {
toWrite = iterateMultiArraySource(arr,1,i);
switch(i) {
case 2:
outData['us'].production.coal.source = toWrite[2];
outData['ca'].production.coal.source = toWrite[0];
outData['zh'].production.coal.source = toWrite[1];
case 3:
outData['us'].production.gas.source = toWrite[2];
outData['ca'].production.gas.source = toWrite[0];
outData['zh'].production.gas.source = toWrite[1];
case 4:
outData['us'].production.oil.source = toWrite[2];
outData['ca'].production.oil.source = toWrite[0];
outData['zh'].production.oil.source = toWrite[1];
case 5:
outData['us'].production.nuclear.source = toWrite[2];
outData['ca'].production.nuclear.source = toWrite[0];
outData['zh'].production.nuclear.source = toWrite[1];
case 6:
outData['us'].production.hydro.source = toWrite[2];
outData['ca'].production.hydro.source = toWrite[0];
outData['zh'].production.hydro.source = toWrite[1];
case 7:
outData['us'].production.renewables.source = toWrite[2];
outData['ca'].production.renewables.source = toWrite[0];
outData['zh'].production.renewables.source = toWrite[1];
case 10:
outData['us'].imports.coal.source = toWrite[2];
outData['ca'].imports.coal.source = toWrite[0];
outData['zh'].imports.coal.source = toWrite[1];
case 11:
outData['us'].imports.gas.source = toWrite[2];
outData['ca'].imports.gas.source = toWrite[0];
outData['zh'].imports.gas.source = toWrite[1];
case 12:
outData['us'].imports.crude_oil.source = toWrite[2];
outData['ca'].imports.crude_oil.source = toWrite[0];
outData['zh'].imports.crude_oil.source = toWrite[1];
case 13:
outData['us'].imports.oil_products.source = toWrite[2];
outData['ca'].imports.oil_products.source = toWrite[0];
outData['zh'].imports.oil_products.source = toWrite[1];
case 14:
outData['us'].imports.electricity.source = toWrite[2];
outData['ca'].imports.electricity.source = toWrite[0];
outData['zh'].imports.electricity.source = toWrite[1];
case 17:
outData['us'].exports.coal.source = toWrite[2];
outData['ca'].exports.coal.source = toWrite[0];
outData['zh'].exports.coal.source = toWrite[1];
case 18:
outData['us'].exports.gas.source = toWrite[2];
outData['ca'].exports.gas.source = toWrite[0];
outData['zh'].exports.gas.source = toWrite[1];
case 19:
outData['us'].exports.crude_oil.source = toWrite[2];
outData['ca'].exports.crude_oil.source = toWrite[0];
outData['zh'].exports.crude_oil.source = toWrite[1];
case 20:
outData['us'].exports.oil_products.source = toWrite[2];
outData['ca'].exports.oil_products.source = toWrite[0];
outData['zh'].exports.oil_products.source = toWrite[1];
case 21:
outData['us'].exports.electricity.source = toWrite[2];
outData['ca'].exports.electricity.source = toWrite[0];
outData['zh'].exports.electricity.source = toWrite[1];
case 24:
outData['us'].consumption.power_sector.source = toWrite[2];
outData['ca'].consumption.power_sector.source = toWrite[0];
outData['zh'].consumption.power_sector.source = toWrite[1];
case 25:
outData['us'].consumption.transport.source = toWrite[2];
outData['ca'].consumption.transport.source = toWrite[0];
outData['zh'].consumption.transport.source = toWrite[1];
case 26:
outData['us'].consumption.industry.source = toWrite[2];
outData['ca'].consumption.industry.source = toWrite[0];
outData['zh'].consumption.industry.source = toWrite[1];
case 27:
outData['us'].consumption.residential.source = toWrite[2];
outData['ca'].consumption.residential.source = toWrite[0];
outData['zh'].consumption.residential.source = toWrite[1];
case 28:
outData['us'].consumption.commercial.source = toWrite[2];
outData['ca'].consumption.commercial.source = toWrite[0];
outData['zh'].consumption.commercial.source = toWrite[1];
case 31:
outData['us'].impact.co2_emissions.source = toWrite[2];
outData['ca'].impact.co2_emissions.source = toWrite[0];
outData['zh'].impact.co2_emissions.source = toWrite[1];
case 32:
outData['us'].impact.water_use.source = toWrite[2];
outData['ca'].impact.water_use.source = toWrite[0];
outData['zh'].impact.water_use.source = toWrite[1];
case 33:
outData['us'].impact.thermal_power_sector_losses.source = toWrite[2];
outData['ca'].impact.thermal_power_sector_losses.source = toWrite[0];
outData['zh'].impact.thermal_power_sector_losses.source = toWrite[1];
}
}
}
// filters wow factor arrays for per capita & total data
function formatWows(arr) {
writeWows(arr, 'total', 2);
writeWows(arr, 'per_capita', 3);
}
// writes wow factors to wow-factor.json
function writeWows(arr, cat, x) {
// console.log(cat, ': ', iterateMultiArrayWow(arr,x,3));
wowData[cat].production.coal = iterateMultiArrayWow(arr,x,2);
wowData[cat].production.gas = iterateMultiArrayWow(arr,x,3);
wowData[cat].production.oil = iterateMultiArrayWow(arr,x,4);
wowData[cat].production.nuclear = iterateMultiArrayWow(arr,x,5);
wowData[cat].production.hydro = iterateMultiArrayWow(arr,x,6);
wowData[cat].production.renewables = iterateMultiArrayWow(arr,x,7);
wowData[cat].imports.coal = iterateMultiArrayWow(arr,x,10);
wowData[cat].imports.gas = iterateMultiArrayWow(arr,x, 11);
wowData[cat].imports.crude_oil = iterateMultiArrayWow(arr,x,12);
wowData[cat].imports.oil_products = iterateMultiArrayWow(arr,x,13);
wowData[cat].imports.electricity = iterateMultiArrayWow(arr,x,14);
wowData[cat].exports.coal = iterateMultiArrayWow(arr,x,17);
wowData[cat].exports.gas = iterateMultiArrayWow(arr,x,18);
wowData[cat].exports.crude_oil = iterateMultiArrayWow(arr,x,19);
wowData[cat].exports.oil_products = iterateMultiArrayWow(arr,x,20);
wowData[cat].exports.electricity = iterateMultiArrayWow(arr,x,21);
wowData[cat].consumption.power_sector = iterateMultiArrayWow(arr,x,24);
wowData[cat].consumption.transport = iterateMultiArrayWow(arr,x,25);
wowData[cat].consumption.industry = iterateMultiArrayWow(arr,x,26);
wowData[cat].consumption.residential = iterateMultiArrayWow(arr,x,27);
wowData[cat].consumption.commercial = iterateMultiArrayWow(arr,x,28);
wowData[cat].impact.co2_emissions = iterateMultiArrayWow(arr,x,31);
wowData[cat].impact.thermal_power_sector_losses = iterateMultiArrayWow(arr,x,32);
wowData[cat].impact.water_use = iterateMultiArrayWow(arr,x,33);
}
// writes the final JSON files to stdout
function writeOutFile(filePath, data) {
fs.writeFile(filePath, data, function(err) {
if (err) return console.log('write error: ', err);
console.log("sucess!");
});
}
// starts the script
readOutFiles(writeFilePaths, begin);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment