Skip to content

Instantly share code, notes, and snippets.

@psaia
Created June 3, 2015 17:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save psaia/26398d9a4dec2f8b2e5f to your computer and use it in GitHub Desktop.
Save psaia/26398d9a4dec2f8b2e5f to your computer and use it in GitHub Desktop.
parse
#!/usr/bin/env node
var fs = require('fs');
var path = require('path');
var async = require('async');
var glob = require('glob');
var slugify = require('slugify');
var parseCSV = require('csv-parse');
var rimraf = require('rimraf');
var d3 = require('d3');
var iconv = require('iconv-lite');
var gm = require('gm').subClass({ imageMagick: true });
var _ = require('lodash');
var join = path.join;
var steps = [];
var PROCESSED_FOLDER = join(__dirname, 'processed');
var LOCAL_RAW_FILES = join(__dirname, 'raw');
/**
 * Recreate the output folder so each run starts from an empty directory.
 *
 * Removes PROCESSED_FOLDER (if present) and then creates it again.
 *
 * @param {function(Error=)} done Called once the folder exists, or with the
 *   error from the removal/creation step that failed.
 */
function createProcessedFolder(done) {
  async.waterfall([
    function(callback) {
      // rimraf reports success for a path that does not exist, so no
      // separate (deprecated) fs.exists() check is needed beforehand.
      rimraf(PROCESSED_FOLDER, function(err) {
        callback(err);
      });
    },
    function(callback) {
      fs.mkdir(PROCESSED_FOLDER, function(err) {
        callback(err);
      });
    }
  ], function(err) {
    // Always reach `done`; previously a failed removal left the caller
    // waiting forever because the waterfall had no final callback.
    done(err);
  });
}
/**
 * Build `heatmap-data.json` from every CSV found in `raw/heatmap`.
 *
 * Each file's base name (without extension) selects the data type through
 * `valMap`. Column headers made only of digits and dashes are treated as a
 * year or year span, and the cell value is stored under
 * `data[year][countryISO][typeLetter]`. After all files are loaded, a colour
 * is attached per year and per type (`uC`, `fC`, `lC`).
 *
 * Read, parse and write failures are thrown, matching the other steps in
 * this file.
 *
 * @param {function()} done Called with no arguments after the file is written.
 */
function heatmapdData(done) {
  var data = {};
  var DEST_FILE = join(PROCESSED_FOLDER, 'heatmap-data.json');
  // Map data keys to one letter values for smaller file size.
  var valMap = {
    'urbanization': 'u',
    'fertility-rates': 'f',
    'life-expectancy': 'l'
  };

  // True only for finite, non-NaN number primitives/objects.
  var isNumber = function(num) {
    return _.isNumber(num) && Math.abs(num) !== Infinity && !isNaN(num);
  };

  // Build a linear colour scale spanning min..max of `arr`, or return false
  // when `arr` has no values (Math.min/max of an empty list are +/-Infinity).
  var getScale = function(arr, colors) {
    var min = Math.min.apply(Math, arr);
    var max = Math.max.apply(Math, arr);
    var lastStop = colors.length - 1;
    var stops;
    if (Math.abs(min) === Infinity || Math.abs(max) === Infinity) {
      return false;
    }
    // A d3 linear scale truncates the longer of domain/range, so a
    // two-value domain silently ignored the third colour. Give the domain
    // one evenly spaced stop per colour so the whole ramp is used.
    stops = lastStop > 0 ?
      colors.map(function(color, i) {
        return i === lastStop ? max : min + (max - min) * (i / lastStop);
      }) :
      [min, max];
    return d3.scale.linear().domain(stops).range(colors);
  };

  // Remove empty states and attach colours. Runs once, after every file has
  // been loaded, instead of being recomputed after each individual file.
  var addTheHeat = function() {
    // Drop states where all three values are present but none is numeric.
    _.forEach(data, function(stateList, yearKey) {
      _.forEach(stateList, function(state, stateISOKey) {
        if (!isNumber(state.u) && !isNumber(state.f) && !isNumber(state.l) &&
            state.u !== undefined && state.f !== undefined && state.l !== undefined) {
          delete data[yearKey][stateISOKey];
        }
      });
    });

    _.forEach(data, function(stateList) {
      var urbanizationVals = [];
      var fertilityVals = [];
      var lifeVals = [];
      var urbanScale;
      var lifeScale;
      var fertilityScale;

      _.forEach(stateList, function(state) {
        if (isNumber(state.u)) {
          urbanizationVals.push(state.u);
        }
        if (isNumber(state.f)) {
          fertilityVals.push(state.f);
        }
        if (isNumber(state.l)) {
          lifeVals.push(state.l);
        }
      });

      // NOTE(review): _.compact also strips legitimate 0 values from the
      // scale input; kept as in the original -- confirm whether intended.
      urbanScale = getScale(_.compact(urbanizationVals), ['#edf8b1', '#7fcdbb', '#2c7fb8']);
      lifeScale = getScale(_.compact(lifeVals), ['#ece2f0', '#a6bddb', '#1c9099']);
      fertilityScale = getScale(_.compact(fertilityVals), ['#e5f5f9', '#99d8c9', '#2ca25f']);

      // Only colour real numbers; undefined or string values previously
      // went through the scale and produced meaningless colours.
      _.forEach(stateList, function(state) {
        if (lifeScale && isNumber(state.l)) {
          state.lC = lifeScale(state.l);
        }
        if (fertilityScale && isNumber(state.f)) {
          state.fC = fertilityScale(state.f);
        }
        if (urbanScale && isNumber(state.u)) {
          state.uC = urbanScale(state.u);
        }
      });
    });
  };

  // Expand two-digit years: below 55 becomes 20xx, otherwise 19xx.
  var normalizeYear = function(year) {
    var y = parseInt(year, 10);
    if (String(y).length === 2) {
      if (y < 55) {
        y = 2000 + y;
      } else {
        y = 1900 + y;
      }
    }
    return y;
  };

  // Store `val` for one country and data type under a year or year span.
  var appendValue = function(years, countryISO, dataType, val) {
    var type = valMap[dataType];
    var value = val === 'NULL' ? null : val;
    var cur;
    var store = function(year) {
      if (!data[year]) {
        data[year] = {};
      }
      if (!data[year][countryISO]) {
        data[year][countryISO] = {};
      }
      data[year][countryISO][type] = value;
    };

    if (!type) {
      // Report the name that was looked up; `type` is undefined here.
      throw new Error('Data type does not exist: ' + dataType);
    }

    if (years[0] !== years[1] && years[1]) {
      // Year span: write every year from the start up to, but not
      // including, the end year.
      for (cur = years[0]; cur < years[1]; cur++) {
        store(cur);
      }
    } else if (isNumber(years[0])) {
      // parseInt yields NaN (never null) for an unparsable header, so test
      // for a real number to avoid creating a "NaN" year.
      store(years[0]);
    }
  };

  var populateDataForRow = function(dataType, row) {
    var countryISO = row['ISO 2 Code'];
    // Normalize Antarctica.
    if (countryISO === 'ATA') {
      countryISO = 'AQ';
    }
    // Loop through keys to find the ones like 2000-10, 1900, 1900-60, etc.
    Object.keys(row).forEach(function(key) {
      if (/^[0-9-]+$/.test(key)) {
        appendValue(key.split('-').map(normalizeYear), countryISO, dataType, row[key]);
      }
    });
  };

  // Load one CSV into `data`; failures are handed to `callback` so they all
  // surface through the single handler at the end of the waterfall.
  var readFile = function(file, callback) {
    var dataType = path.basename(file).replace(/\..+$/, '');
    fs.readFile(file, { encoding: 'utf8' }, function(err, contents) {
      if (err) {
        return callback(err);
      }
      parseCSV(contents, {
        delimiter: ',',
        skip_empty_lines: 1,
        trim: 1,
        columns: true,
        auto_parse: 1
      }, function(err2, rows) {
        if (err2) {
          return callback(err2);
        }
        try {
          rows.forEach(populateDataForRow.bind(null, dataType));
        } catch (rowErr) {
          return callback(rowErr);
        }
        process.nextTick(callback);
      });
    });
  };

  async.waterfall([
    async.apply(glob, join(LOCAL_RAW_FILES, 'heatmap/*.csv')),
    function(files, callback) {
      async.each(files, readFile, callback);
    }
  ], function(err) {
    if (err) {
      throw err;
    }
    addTheHeat();
    fs.writeFile(
      DEST_FILE,
      JSON.stringify(data),
      { encoding: 'utf8' },
      function(writeErr) {
        // Do not report success when the file could not be written.
        if (writeErr) {
          throw writeErr;
        }
        console.log('Completed all heatmap data.');
        done();
      }
    );
  });
}
/**
 * Build `fossil-fuel-circles.json` from the `raw/fossil-fuel/gridcar.*` files.
 *
 * Runs in two passes over the files: the first collects each file's largest
 * value so every year shares one opacity/colour scale; the second samples
 * each grid and stores `{ coord, fill, opacity }` points under the year taken
 * from the file name's trailing digits.
 *
 * Read and write failures are thrown, matching the other steps in this file.
 *
 * @param {function()} done Called with no arguments after the file is written.
 */
function co2Data(done) {
  var data = {};
  // Only every `precision`-th row and column is sampled.
  var precision = 4;
  // Set up a projection that can be used to translate grid positions.
  var width = 360;
  var height = 180;
  var prj = d3.geo.mercator()
    .scale((width + 1) / 2 / Math.PI)
    .translate([width / 2, height / 2]);
  // Largest value of each file, gathered in the first pass.
  var allValues = [];
  var _files;

  // Round to two decimals and return a number again.
  var shortenNum = function(num) {
    return parseFloat(Number(num).toFixed(2));
  };

  // Keep only lines that contain a digit and parse as a number.
  var filterValues = function(val) {
    return /[0-9]/.test(val) && val !== '' && !isNaN(parseFloat(val));
  };

  // Turn a file's text into a flat list of rounded numbers, one per line.
  var parseGrid = function(filedata) {
    return filedata.split('\n').filter(filterValues).map(shortenNum);
  };

  var accumulateValues = function(filedata) {
    var grid = parseGrid(filedata);
    // Math.max of an empty list is -Infinity; skip files without values.
    if (grid.length) {
      allValues.push(Math.max.apply(null, grid));
    }
  };

  var processData = function(filedata, year, callback) {
    // Rows of 360 values each.
    // NOTE(review): presumably 360 longitude columns by 180 latitude rows,
    // matching width/height above -- confirm against the data source.
    var chunked = _.chunk(parseGrid(filedata), 360);
    var total = Math.max.apply(null, allValues);
    var opacityScale = d3.scale.linear()
      .domain([0, total])
      .range([0, 1]);
    var colorScale = d3.scale.linear()
      .domain([0, total])
      .range(['#B8DFFF', '#C70000']);

    chunked.forEach(function(row, rowIndex) {
      if (rowIndex % precision !== 0) {
        return;
      }
      row.forEach(function(col, index) {
        var opacity = shortenNum(opacityScale(col));
        var coord;
        // Column 0 is never emitted, as in the original sampling rule.
        if (opacity > 0 && index !== 0 && index % precision === 0) {
          coord = prj.invert([index, rowIndex]);
          coord[0] = shortenNum(coord[0]);
          coord[1] = shortenNum(coord[1]);
          year.push({
            coord: coord,
            fill: colorScale(col),
            opacity: opacity
          });
        }
      });
    });
    callback(null);
  };

  var readFileAccumulate = function(file, callback) {
    fs.readFile(file, { encoding: 'utf8' }, function(err, contents) {
      if (err) {
        return callback(err);
      }
      accumulateValues(contents);
      process.nextTick(callback);
    });
  };

  var readFileProcess = function(file, callback) {
    var match = file.match(/[\d]+$/);
    var year;
    // The glob can match names without a numeric suffix; fail with a clear
    // message instead of a TypeError from indexing a null match.
    if (!match) {
      return callback(new Error('Cannot determine year from file name: ' + file));
    }
    year = match[0];
    data[year] = [];
    fs.readFile(file, { encoding: 'utf8' }, function(err, contents) {
      if (err) {
        return callback(err);
      }
      processData(contents, data[year], callback);
    });
  };

  async.waterfall([
    async.apply(glob, join(LOCAL_RAW_FILES, 'fossil-fuel/gridcar.*')),
    function(files, callback) {
      _files = files;
      async.each(files, readFileAccumulate, callback);
    },
    function(callback) {
      async.each(_files, readFileProcess, callback);
    }
  ], function(err) {
    if (err) {
      throw err;
    }
    fs.writeFile(
      join(PROCESSED_FOLDER, 'fossil-fuel-circles.json'),
      JSON.stringify(data),
      { encoding: 'utf8' },
      function(writeErr) {
        // Do not report success when the file could not be written.
        if (writeErr) {
          throw writeErr;
        }
        console.info('Completed fossil fuel data.');
        done();
      }
    );
  });
}
/**
 * Build `population-points.json` from `raw/population-points/points.csv`.
 *
 * The CSV is decoded from ISO-8859-1 and parsed without a header mapping, so
 * cells are read by position. A row is kept only when it has a name and both
 * coordinate cells are numbers.
 *
 * Parse and write failures are thrown, matching the other steps in this file.
 *
 * @param {function()} done Called with no arguments after the file is written.
 */
function pointData(done) {
  var DEST_FILE = join(PROCESSED_FOLDER, 'population-points.json');
  var data = [];

  var convert = function(parseErr, rows) {
    // Without this check a parse failure surfaced as a TypeError on
    // `rows.forEach`, hiding the real cause.
    if (parseErr) {
      throw parseErr;
    }

    rows.forEach(function(row) {
      var obj = {
        coord: [row[4], row[5]],
        name: row[2],
        location: row[7],
        description: row[6],
        year: row[3]
      };
      var hasCoords = typeof obj.coord[0] === 'number' &&
        typeof obj.coord[1] === 'number';
      if (obj.name && hasCoords) {
        data.push(obj);
      }
    });

    fs.writeFile(DEST_FILE, JSON.stringify(data), function(writeErr) {
      // Do not report success when the file could not be written.
      if (writeErr) {
        throw writeErr;
      }
      console.info('Completed population points.');
      done();
    });
  };

  fs.createReadStream(join(
    LOCAL_RAW_FILES,
    'population-points/points.csv'
  )).pipe(iconv.decodeStream('iso-8859-1')).pipe(parseCSV({
    delimiter: ',',
    skip_empty_lines: 1,
    trim: 1,
    auto_parse: 1
  }, convert));
}
/**
 * Build `timeline-milestones.json` from
 * `raw/timeline-milestone/timeline-milestones.csv`.
 *
 * Steps, in order:
 *  1. Rename every image in the images folder to a slugified, lower-case
 *     name and shrink images wider or taller than 1000px to 800px.
 *  2. List the images again (their names may have changed).
 *  3. Convert each CSV row into a milestone object, attaching the first
 *     image whose path matches `<year_begin>[-<year_end>]-<slugified title>`.
 *
 * Rows are kept only when both years are numbers and a title is present.
 * All failures are thrown, matching the other error paths in this file.
 *
 * @param {function()} done Called with no arguments after the file is written.
 */
function milestoneData(done) {
  var parser = parseCSV({
    delimiter: ',',
    skip_empty_lines: 1,
    trim: 1,
    auto_parse: 1
  }, function _parserCb(err, rows) {
    var data = [];
    var cachedImages;
    var DEST_FILE = join(PROCESSED_FOLDER, 'timeline-milestones.json');
    var IMAGE_DIR = join(LOCAL_RAW_FILES, 'timeline-milestone', 'images');

    // Slugify a file name or title: lower case, underscores to dashes,
    // runs of dashes collapsed to one.
    var superSlugify = function(str) {
      // String() guards against titles that auto_parse turned into numbers.
      return slugify(path.basename(String(str)))
        .toLowerCase()
        // Global flag: a plain string pattern only replaced the first "_".
        .replace(/_/g, '-')
        .replace(/[-]+/g, '-');
    };

    // List the image files currently in IMAGE_DIR.
    var allImages = function(allImagesComplete) {
      glob(
        join(IMAGE_DIR, '/*(*.jpg|*.jpeg|*.png|*.gif)'),
        { nocase: true },
        allImagesComplete
      );
    };

    // Rename and, where needed, resize every image.
    var normalizeImages = function(imgNormalizationComplete) {
      var normalize = function(img, normalized) {
        var newPath = join(path.dirname(img), superSlugify(path.basename(img)));
        fs.rename(img, newPath, function(renameErr) {
          if (renameErr) {
            return normalized(renameErr);
          }
          gm(newPath).size(function(sizeErr, size) {
            var resized;
            if (sizeErr) {
              return normalized(sizeErr);
            }
            // Fresh handle for the write, as in the original.
            resized = gm(newPath);
            if (size.width > 1000) {
              resized.resize(800, null);
              resized.write(newPath, normalized);
            } else if (size.height > 1000) {
              resized.resize(null, 800);
              resized.write(newPath, normalized);
            } else {
              normalized();
            }
          });
        });
      };

      allImages(function(globErr, images) {
        // A failed listing used to be ignored, passing `undefined` on.
        if (globErr) {
          return imgNormalizationComplete(globErr);
        }
        async.each(images, normalize, imgNormalizationComplete);
      });
    };

    // Expand a category abbreviation; unknown values are returned unchanged.
    var categoryMap = function(cat) {
      switch (cat.toLowerCase().trim().replace(';', '')) {
        case 's&t': return 'Science & Technology';
        case 'f&a': return 'Food & Agriculture';
        case 'env': return 'Environment';
        case 'p&s': return 'People & Society';
        default: return cat;
      }
    };

    // Find the first cached image path matching the milestone's years and
    // title; yields `false` when none matches.
    var associatedImage = function(obj, imageComplete) {
      var escapedTitle = superSlugify(obj.title)
        .replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
      // The pattern depends only on the row, so build it once rather than
      // once per image.
      var exp = new RegExp(
        obj.year_begin +
        '(?:-' +
        obj.year_end +
        ')?' +
        '-' +
        escapedTitle +
        '.+$'
      );
      var search = function() {
        var i;
        for (i = 0; i < cachedImages.length; i++) {
          if (exp.test(cachedImages[i])) {
            return imageComplete(null, cachedImages[i]);
          }
        }
        imageComplete(null, false);
      };
      process.nextTick(search);
    };

    var processRow = function(row, callback) {
      // A missing or non-string category cell would make `.split` throw.
      var rawCategory = row[2] === undefined || row[2] === null ? '' : String(row[2]);
      var obj = {
        year_begin: row[0],
        year_end: row[1],
        category: rawCategory.split(' ').map(categoryMap),
        title: row[3] || null,
        description: row[4] || null,
        image: null,
        location: row[row.length - 1] || null
      };
      var isComplete = typeof obj.year_end === 'number' &&
        typeof obj.year_begin === 'number' &&
        obj.title;

      if (!isComplete) {
        return callback();
      }
      associatedImage(obj, function(imgErr, img) {
        if (img) {
          obj.image = img;
        }
        data.push(obj);
        callback();
      });
    };

    var onComplete = function(rowsErr) {
      if (rowsErr) {
        throw rowsErr;
      }
      fs.writeFile(DEST_FILE, JSON.stringify(data), function(writeErr) {
        if (writeErr) {
          throw writeErr;
        }
        console.info('Completed timeline milestones.');
        done();
      });
    };

    if (err) {
      // Previously this only logged and returned, so `done` was never
      // called and the remaining steps were skipped without any failure
      // status. Fail like every other error path in this function.
      throw err;
    }

    normalizeImages(function(imageErr) {
      if (imageErr) {
        throw imageErr;
      }
      allImages(function(globErr, images) {
        if (globErr) {
          throw globErr;
        }
        cachedImages = images;
        async.each(rows, processRow, onComplete);
      });
    });
  });

  fs.createReadStream(join(
    LOCAL_RAW_FILES,
    'timeline-milestone/timeline-milestones.csv'
  )).pipe(iconv.decodeStream('iso-8859-1')).pipe(parser);
}
// Pipeline order: the output folder must exist before any step writes to it.
steps.push(createProcessedFolder);
steps.push(milestoneData);
steps.push(heatmapdData);
steps.push(pointData);
steps.push(co2Data);

// Run the steps one after another; the first error stops the run.
async.waterfall(steps, function _done(err) {
  if (err) {
    console.error(err);
    // Signal failure to the shell and do not claim success below.
    process.exitCode = 1;
    return;
  }
  console.info('Done!');
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment