Instantly share code, notes, and snippets.

Embed
What would you like to do?
Processing of a 50-million-row benchmark file using my Orzo.js (http://www.orzojs.org/). The task is to calculate the count, max, min, and average per day. Inspired by an article at http://padak.keboola.com/agregace-v-mongodb-oracle-redshift-bigquery-voltdb-vertica-elasticsearch-a-gooddata. Processing time on an Intel i2700 (average of several attempts): 142 sec
// Partition the work into 4 chunks; each worker gets a reader over its own
// slice of the input file (first CLI argument).
dataChunks(4, function (chunkId) {
    return orzo.fileChunkReader(env.inputArgs[0], chunkId, null, 1);
});
// Drain a chunk line by line, feeding each line into the map phase.
applyItems(function (chunk, map) {
    for (; chunk.hasNext();) {
        map(chunk.next());
    }
});
// Parse one CSV line of the form "YYYY-MM-DDThh:mm:ssZ,value" and emit
// the date part (YYYY-MM-DD) as the key with the numeric value.
// Lines that do not match the expected format are silently skipped.
map(function (data) {
    var lineFormat = /^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z,(.+)/;
    var parts = lineFormat.exec(data);
    if (parts) {
        // groups 1-3 are year, month, day; group 7 is the measured value
        emit(parts.slice(1, 4).join('-'), parseFloat(parts[7]));
    }
});
// Aggregate all values collected for a single date key using 6 reduce
// workers; emits [average, count, min, max] per key.
reduce(6, function (key, values) {
    var stats = D(values);
    var summary = [stats.average(), stats.size(), stats.min(), stats.max()];
    emit(key, summary);
});
// Write the per-date aggregates out as CSV.
// Fix: the original row format had only 4 columns for a 5-column header,
// dropped the max value entirely, and wrote average before count (the
// header order is count,average). Each result row is now
// date,count,average,min,max matching the header and the reduce output
// [average, count, min, max].
finish(function (results) {
    doWith(orzo.fileWriter('./test2.csv'), function () {
        var self = this;
        self.write(orzo.sprintf('date,count,average,min,max\n'));
        results.each(function (key, values) {
            // values[0] = [average, count, min, max] as emitted by reduce
            self.write(orzo.sprintf('%s,%d,%01.5f,%01.5f,%01.5f\n',
                    key, values[0][1], values[0][0], values[0][2], values[0][3]));
        });
    });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment