Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
node js script to parse csv data into json for the Woodrow Wilson Center's "Tracking the Energy Titans" data-viz
/* Import modules */
var csv = require('fast-csv'),
fs = require('fs');
/* "global" variables */
var inputFilePath = '../data/csv/',
writeFilePaths = ['../data/country-data.json','../data/wow-factor.json']
rawData = {},
outData = null,
wowData = null,
numFiles = null,
count = 0;
/*
Process:
1. read CSV files from inputFilePath
2. read JSON files from writeFilePath (poor variable naming, I know)
3. for each CSV parse data & assign values to JSON
4. write country level data & sources to ./country-data.json
5. write the wow factors to ./wow-factor.json
Notes:
- In parseData() once all the CSV input files have been read and parsed
the output files will be written to the directory containing this script.
- Optionally you may comment out the writeOutFile() calls and instead console.log
the data to inspect it.
- A helpful way to do this is on the command line is by piping the output:
node parse-data.js | python -m json.tool | less
*/
// read the output JSON files to grab the data structure
function readOutFiles(filePathArray, callback){
fs.readFile(filePathArray[0],function(err,data) {
if (err) {
return console.log('read file error: ', err);
}
outData = JSON.parse(data);
});
fs.readFile(filePathArray[1],function(err,data) {
if (err) {
return console.log('read file error: ', err);
}
wowData = JSON.parse(data);
});
if (callback && typeof callback === 'function') {
callback();
}
}
// get it all going
function begin(){
fs.readdir(inputFilePath, function(err, files){
// returns an array of file names
numFiles = files.length;
for (var i = 0; i<files.length; i++) {
parseFile(files[i]);
}
});
}
// iterate over the CSV files
// converts CSV data to multi-dimensional arrays
// stores temporarily in rawData
function parseFile(fileName) {
var chunk = [];
csv
.fromPath(inputFilePath + fileName)
.on('data', function(data, error){
if (error) console.log('csv parse error: ', error);
// console.log(fileName, ': ', data);
chunk.push(data);
})
.on('end', function(){
rawData[fileName] = chunk;
count ++;
if (count === numFiles) {
// console.log('done: ', JSON.stringify(rawData), '\n');
parseData(rawData);
}
});
}
// parses multi-dimensional arrays from csv module
function parseData(parsedCSV) {
var count = 0;
// console.log('parsedCSV: ', parsedCSV);
for (var k in parsedCSV) {
var obj = parsedCSV[k];
if (k === 'us_total.csv') {
formatData('us', 'total', obj);
} else if (k === 'canada_total.csv') {
formatData('ca', 'total', obj);
} else if (k === 'china_total.csv') {
formatData('zh', 'total', obj);
} else if (k === 'us_per_capita.csv') {
formatData('us', 'per_capita', obj);
} else if (k === 'canada_per_capita.csv') {
formatData('ca', 'per_capita', obj);
} else if (k === 'china_per_capita.csv') {
formatData('zh', 'per_capita', obj);
} else if (k === 'wow_factor.csv') {
formatWows(obj);
} else if (k==='sources.csv') {
formatSources(obj);
}
count +=1;
if (count===numFiles) {
/*
comment these two lines out to not write output files
*/
writeOutFile("country-data.json", JSON.stringify(outData));
writeOutFile("wow-factor.json", JSON.stringify(wowData));
/*
uncomment these two lines to test output
*/
// console.log(JSON.stringify(outData));
// console.log(JSON.stringify(wowData));
}
}
}
function iterateMultiArray(arr,x,y) {
// iterates over a multi-dimensional array
// returns an array of numeric values
var i = 0, l = arr.length, arrToReturn = [];
for (i; i<l; i++) {
if (i===y) {
for (var j=0; j<arr[i].length; j++) {
if (j>x) {
var tmp = arr[i][j]
arrToReturn.push(filterFloat(tmp));
}
}
}
}
return mapData2Years(arrToReturn);
}
function mapData2Years(arr) {
// maps each value of an array to its corresponding year
// preferred data format for d3
var years = [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014];
var toReturn = [];
for (var i = 0; i < arr.length; i++) {
var obj = {};
obj.val = arr[i];
obj.year = years[i];
// only add the data if it isn't null
if (obj.val >= 0) {
toReturn.push(obj);
}
}
return toReturn;
}
function iterateMultiArraySource(arr,x,y) {
// iterates over a multi-dimensional array
// returns an array of text values
var i = 0, l = arr.length, arrToReturn = [];
for (i; i<l; i++) {
if (i===y) {
for (var j=0; j<arr[i].length; j++) {
if (j>x) {
var tmp = arr[i][j]
arrToReturn.push(tmp);
}
}
}
}
return arrToReturn;
}
function iterateMultiArrayWow(arr,x,y) {
// iterates over a multi-dimensional array for Wow Factor data
// returns a string of text for the wow factor
var i = 0, l = arr.length, val;
for (i; i<l; i++) {
if (i === y) {
for (var j=0; j<arr[i].length; j++) {
if (j === x && arr[i][j].length > 0) {
val = arr[i][j];
// remove any extra whitespace, tabs, newlines, etc.
val = val.replace(/\s\s+/g, ' ');
} else if (j === x && arr[i][j].length === 0) {
// write placeholder text in case value is empty
val = '*** place holder text ***';
console.log('wow val: ', val);
}
}
}
}
return val;
}
// cast data type from string to a number type
// if no data the value will be NaN and output value in JSON will be null
function filterFloat(value) {
value = value.replace(/ /g,'');
if(/^(\-|\+)?([0-9]+(\.[0-9]+)?|Infinity)$/
.test(value))
return Number(value);
return NaN;
}
// takes input data and assigns it to country-data.json
function formatData(country, dataClass, arr) {
var index = 0;
if (country==="ca") {
index = 1;
} else if (country==="zh") {
index = 2;
}
// console.log(country, ' ', dataClass, ' test: ', iterateMultiArray(arr,2,21));
outData[country].production.coal[dataClass] = iterateMultiArray(arr,2,2);
outData[country].production.gas[dataClass] = iterateMultiArray(arr,2,3);
outData[country].production.oil[dataClass] = iterateMultiArray(arr,2,4);
outData[country].production.nuclear[dataClass] = iterateMultiArray(arr,2,5);
outData[country].production.hydro[dataClass] = iterateMultiArray(arr,2,6);
outData[country].production.renewables[dataClass] = iterateMultiArray(arr,2,7);
outData[country].imports.coal[dataClass] = iterateMultiArray(arr,2,10);
outData[country].imports.gas[dataClass] = iterateMultiArray(arr,2, 11);
outData[country].imports.crude_oil[dataClass] = iterateMultiArray(arr,2,12);
outData[country].imports.oil_products[dataClass] = iterateMultiArray(arr,2,13);
outData[country].imports.electricity[dataClass] = iterateMultiArray(arr,2,14);
outData[country].exports.coal[dataClass] = iterateMultiArray(arr,2,17);
outData[country].exports.gas[dataClass] = iterateMultiArray(arr,2,18);
outData[country].exports.crude_oil[dataClass] = iterateMultiArray(arr,2,19);
outData[country].exports.oil_products[dataClass] = iterateMultiArray(arr,2,20);
outData[country].exports.electricity[dataClass] = iterateMultiArray(arr,2,21);
outData[country].consumption.power_sector[dataClass] = iterateMultiArray(arr,2,24);
outData[country].consumption.transport[dataClass] = iterateMultiArray(arr,2,25);
outData[country].consumption.industry[dataClass] = iterateMultiArray(arr,2,26);
outData[country].consumption.residential[dataClass] = iterateMultiArray(arr,2,27);
outData[country].consumption.commercial[dataClass] = iterateMultiArray(arr,2,28);
outData[country].impact.co2_emissions[dataClass] = iterateMultiArray(arr,2,31);
outData[country].impact.water_use[dataClass] = iterateMultiArray(arr,2,32);
outData[country].impact.thermal_power_sector_losses[dataClass] = iterateMultiArray(arr,2,33);
}
// writes the data sources to country-data.json
function formatSources(arr) {
var i = 0, l=arr.length, toWrite=null;
// var i corresponds to line # -1 in sources.csv
for (i; i<l; i++) {
toWrite = iterateMultiArraySource(arr,1,i);
switch(i) {
case 2:
outData['us'].production.coal.source = toWrite[2];
outData['ca'].production.coal.source = toWrite[0];
outData['zh'].production.coal.source = toWrite[1];
case 3:
outData['us'].production.gas.source = toWrite[2];
outData['ca'].production.gas.source = toWrite[0];
outData['zh'].production.gas.source = toWrite[1];
case 4:
outData['us'].production.oil.source = toWrite[2];
outData['ca'].production.oil.source = toWrite[0];
outData['zh'].production.oil.source = toWrite[1];
case 5:
outData['us'].production.nuclear.source = toWrite[2];
outData['ca'].production.nuclear.source = toWrite[0];
outData['zh'].production.nuclear.source = toWrite[1];
case 6:
outData['us'].production.hydro.source = toWrite[2];
outData['ca'].production.hydro.source = toWrite[0];
outData['zh'].production.hydro.source = toWrite[1];
case 7:
outData['us'].production.renewables.source = toWrite[2];
outData['ca'].production.renewables.source = toWrite[0];
outData['zh'].production.renewables.source = toWrite[1];
case 10:
outData['us'].imports.coal.source = toWrite[2];
outData['ca'].imports.coal.source = toWrite[0];
outData['zh'].imports.coal.source = toWrite[1];
case 11:
outData['us'].imports.gas.source = toWrite[2];
outData['ca'].imports.gas.source = toWrite[0];
outData['zh'].imports.gas.source = toWrite[1];
case 12:
outData['us'].imports.crude_oil.source = toWrite[2];
outData['ca'].imports.crude_oil.source = toWrite[0];
outData['zh'].imports.crude_oil.source = toWrite[1];
case 13:
outData['us'].imports.oil_products.source = toWrite[2];
outData['ca'].imports.oil_products.source = toWrite[0];
outData['zh'].imports.oil_products.source = toWrite[1];
case 14:
outData['us'].imports.electricity.source = toWrite[2];
outData['ca'].imports.electricity.source = toWrite[0];
outData['zh'].imports.electricity.source = toWrite[1];
case 17:
outData['us'].exports.coal.source = toWrite[2];
outData['ca'].exports.coal.source = toWrite[0];
outData['zh'].exports.coal.source = toWrite[1];
case 18:
outData['us'].exports.gas.source = toWrite[2];
outData['ca'].exports.gas.source = toWrite[0];
outData['zh'].exports.gas.source = toWrite[1];
case 19:
outData['us'].exports.crude_oil.source = toWrite[2];
outData['ca'].exports.crude_oil.source = toWrite[0];
outData['zh'].exports.crude_oil.source = toWrite[1];
case 20:
outData['us'].exports.oil_products.source = toWrite[2];
outData['ca'].exports.oil_products.source = toWrite[0];
outData['zh'].exports.oil_products.source = toWrite[1];
case 21:
outData['us'].exports.electricity.source = toWrite[2];
outData['ca'].exports.electricity.source = toWrite[0];
outData['zh'].exports.electricity.source = toWrite[1];
case 24:
outData['us'].consumption.power_sector.source = toWrite[2];
outData['ca'].consumption.power_sector.source = toWrite[0];
outData['zh'].consumption.power_sector.source = toWrite[1];
case 25:
outData['us'].consumption.transport.source = toWrite[2];
outData['ca'].consumption.transport.source = toWrite[0];
outData['zh'].consumption.transport.source = toWrite[1];
case 26:
outData['us'].consumption.industry.source = toWrite[2];
outData['ca'].consumption.industry.source = toWrite[0];
outData['zh'].consumption.industry.source = toWrite[1];
case 27:
outData['us'].consumption.residential.source = toWrite[2];
outData['ca'].consumption.residential.source = toWrite[0];
outData['zh'].consumption.residential.source = toWrite[1];
case 28:
outData['us'].consumption.commercial.source = toWrite[2];
outData['ca'].consumption.commercial.source = toWrite[0];
outData['zh'].consumption.commercial.source = toWrite[1];
case 31:
outData['us'].impact.co2_emissions.source = toWrite[2];
outData['ca'].impact.co2_emissions.source = toWrite[0];
outData['zh'].impact.co2_emissions.source = toWrite[1];
case 32:
outData['us'].impact.water_use.source = toWrite[2];
outData['ca'].impact.water_use.source = toWrite[0];
outData['zh'].impact.water_use.source = toWrite[1];
case 33:
outData['us'].impact.thermal_power_sector_losses.source = toWrite[2];
outData['ca'].impact.thermal_power_sector_losses.source = toWrite[0];
outData['zh'].impact.thermal_power_sector_losses.source = toWrite[1];
}
}
}
// filters wow factor arrays for per capita & total data
function formatWows(arr) {
writeWows(arr, 'total', 2);
writeWows(arr, 'per_capita', 3);
}
// writes wow factors to wow-factor.json
function writeWows(arr, cat, x) {
// console.log(cat, ': ', iterateMultiArrayWow(arr,x,3));
wowData[cat].production.coal = iterateMultiArrayWow(arr,x,2);
wowData[cat].production.gas = iterateMultiArrayWow(arr,x,3);
wowData[cat].production.oil = iterateMultiArrayWow(arr,x,4);
wowData[cat].production.nuclear = iterateMultiArrayWow(arr,x,5);
wowData[cat].production.hydro = iterateMultiArrayWow(arr,x,6);
wowData[cat].production.renewables = iterateMultiArrayWow(arr,x,7);
wowData[cat].imports.coal = iterateMultiArrayWow(arr,x,10);
wowData[cat].imports.gas = iterateMultiArrayWow(arr,x, 11);
wowData[cat].imports.crude_oil = iterateMultiArrayWow(arr,x,12);
wowData[cat].imports.oil_products = iterateMultiArrayWow(arr,x,13);
wowData[cat].imports.electricity = iterateMultiArrayWow(arr,x,14);
wowData[cat].exports.coal = iterateMultiArrayWow(arr,x,17);
wowData[cat].exports.gas = iterateMultiArrayWow(arr,x,18);
wowData[cat].exports.crude_oil = iterateMultiArrayWow(arr,x,19);
wowData[cat].exports.oil_products = iterateMultiArrayWow(arr,x,20);
wowData[cat].exports.electricity = iterateMultiArrayWow(arr,x,21);
wowData[cat].consumption.power_sector = iterateMultiArrayWow(arr,x,24);
wowData[cat].consumption.transport = iterateMultiArrayWow(arr,x,25);
wowData[cat].consumption.industry = iterateMultiArrayWow(arr,x,26);
wowData[cat].consumption.residential = iterateMultiArrayWow(arr,x,27);
wowData[cat].consumption.commercial = iterateMultiArrayWow(arr,x,28);
wowData[cat].impact.co2_emissions = iterateMultiArrayWow(arr,x,31);
wowData[cat].impact.thermal_power_sector_losses = iterateMultiArrayWow(arr,x,32);
wowData[cat].impact.water_use = iterateMultiArrayWow(arr,x,33);
}
// writes the final JSON files to stdout
function writeOutFile(filePath, data) {
fs.writeFile(filePath, data, function(err) {
if (err) return console.log('write error: ', err);
console.log("sucess!");
});
}
// starts the script
readOutFiles(writeFilePaths, begin);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment