Skip to content

Instantly share code, notes, and snippets.

@nmalkin
Forked from jedp/sample.js
Created June 7, 2012 17:32
Show Gist options
  • Save nmalkin/2890222 to your computer and use it in GitHub Desktop.
Save nmalkin/2890222 to your computer and use it in GitHub Desktop.
kpiggybank data simulator
#!/usr/bin/env node
/**
* Create sample interaction data, using valid kpiggybank
* data as a seed.
*
* Can use from the command line, like so:
*
* node sample.js 42 # gets 42 data blobs
*
* Or from a module:
*
* var sampler = require('./sample');
*
* // Generate 10000 data points based on kpi data
* sampler.generate(10000, function(data) { ... });
*
* The program will always read all KPI data from kpiggybank,
* and create extra sample data as necessary by jittering original
* data blobs. It ensures that each event takes at least about
* 1/4 second.
*/
var http = require('http');
var KPIGGYBANK = 'kpiggybank.hacksign.in';
var JITTER_FACTOR = 0.2;
/**
 * Download the interaction data dump from the kpiggybank server.
 *
 * Issues an HTTP GET for /wsapi/interaction_data on the KPIGGYBANK host,
 * buffers the complete response body, parses it as JSON and passes the
 * parsed value to `callback`.
 *
 * Note: no 'error' listener is attached to the request and the HTTP status
 * code is not inspected; JSON.parse will throw (inside the 'end' handler)
 * if the body is not valid JSON.
 *
 * @param {function(*)} callback - invoked once with the parsed JSON body.
 */
function getDataDump(callback) {
  var body = '';
  http.get({
    host: KPIGGYBANK,
    path: '/wsapi/interaction_data'
  }, function(res) {
    // Decode at the stream level. Without this each chunk is a Buffer that
    // gets stringified on its own, which corrupts any multi-byte UTF-8
    // character that happens to straddle a chunk boundary.
    res.setEncoding('utf8');
    res.on('data', function(chunk) {
      body += chunk;
    });
    res.on('end', function() {
      return callback(JSON.parse(body));
    });
  });
}
/**
 * Extract the kpi blob (the `value` property) from each record of the
 * dump, keeping only those that have all the kpi fields we want.
 *
 * A blob is kept when:
 *   - `event_stream` is a non-null object,
 *   - `user_agent` is a non-null object whose `os` is not the string
 *     'Undefined',
 *   - `number_sites_logged_in` is truthy (so a missing value, and also a
 *     count of 0, is rejected).
 *
 * Records without a `value`, and blobs whose `event_stream` or
 * `user_agent` is null, are skipped instead of throwing.
 *
 * @param {Object} obj - dictionary of records, each shaped { value: blob }.
 * @param {function(Array)} callback - invoked synchronously with the list
 *     of accepted blobs (the original objects, not copies).
 * @returns {*} whatever `callback` returns.
 */
function filterDataDump(obj, callback) {
  var goodData = [];

  // typeof null is 'object', so null has to be excluded explicitly.
  function isRealObject(candidate) {
    return candidate !== null && typeof candidate === 'object';
  }

  Object.keys(obj).forEach(function(key) {
    var record = obj[key];
    var blob = record && record.value;
    if (blob &&
        isRealObject(blob.event_stream) &&
        isRealObject(blob.user_agent) &&
        blob.user_agent.os !== 'Undefined' &&
        blob.number_sites_logged_in) {
      goodData.push(blob);
    }
  });

  return callback(goodData);
}
/**
 * Build a jittered copy of an event stream.
 *
 * Every entry of `eventStream` is read as [name, value]. The returned list
 * holds one fresh [name, newValue] pair per entry (any further elements of
 * the entry are not copied). The value is shifted by a random amount of up
 * to +/- JITTER_FACTOR of itself, and is then raised to a randomized floor
 * in the range 200..299 if it ended up below that floor. Per the file
 * header that floor is the "about 1/4 second" minimum, so values are
 * presumably milliseconds.
 *
 * Note - this makes xhr events look like they took at least ~1/4 sec as
 * well; the assumption is that nobody cares about xhr events.
 *
 * The input stream is not modified.
 */
function jitterEvents(eventStream) {
  return eventStream.reduce(function(jittered, entry) {
    var name = entry[0];
    var original = entry[1];

    // Random offset within +/- JITTER_FACTOR of the original value.
    var offset = Math.floor(original * (Math.random() - 0.5) * (JITTER_FACTOR * 2));
    // Lower bound for this event: 250 give or take 50.
    var floorValue = 250 + Math.floor((Math.random() - 0.5) * 100);

    var shifted = original + offset;
    jittered.push([name, Math.max(floorValue, shifted)]);
    return jittered;
  }, []);
}
/**
 * Multiply a list of data blobs.
 *
 * For every blob in `list` the output contains the blob itself (same
 * object) immediately followed by `times` derived copies. Each copy gets
 * an `_id` of the form "<original _id>-<n>" to tag it as a duplicate, a
 * freshly jittered `event_stream`, and the original's `sample_rate`,
 * `timestamp`, `lang`, `number_sites_logged_in` and `user_agent` values
 * (`user_agent` is shared by reference, not cloned).
 *
 * Calls back synchronously with the amplified list and returns whatever
 * the callback returns.
 */
function amplifyData(list, times, callback) {
  // Build the n-th jittered duplicate of a source blob.
  function jitteredCopy(source, n) {
    var copy = {};
    copy._id = source._id + '-' + n; // tag as a dup
    copy.event_stream = jitterEvents(source.event_stream);
    copy.sample_rate = source.sample_rate;
    copy.timestamp = source.timestamp; // already rounded off
    copy.lang = source.lang;
    copy.number_sites_logged_in = source.number_sites_logged_in;
    copy.user_agent = source.user_agent;
    return copy;
  }

  var amplified = list.reduce(function(acc, source) {
    acc.push(source);
    var n = 0;
    while (n < times) {
      acc.push(jitteredCopy(source, n));
      n += 1;
    }
    return acc;
  }, []);

  return callback(amplified);
}
/**
 * Generate up to `wantCount` data blobs and pass them to `callback`.
 *
 * Downloads the dump (getDataDump), keeps the usable blobs
 * (filterDataDump) and, when more blobs are wanted than were downloaded,
 * pads the list with jittered duplicates (amplifyData). The resulting
 * list is cut to `wantCount` entries. If no usable blobs were downloaded
 * the callback receives an empty list.
 *
 * @param {number} wantCount - number of data blobs wanted.
 * @param {function(Array)} callback - invoked with the list of blobs.
 */
var generate = module.exports.generate = function generate(wantCount, callback) {
  getDataDump(function(obj) {
    filterDataDump(obj, function(data) {
      // Nothing to amplify, or already enough real data: just trim.
      if (data.length === 0 || wantCount <= data.length) {
        return callback(data.slice(0, wantCount));
      }

      // amplifyData emits every original blob PLUS `copies` duplicates of
      // it, so each seed yields copies + 1 entries. Asking for
      // ceil(want / have) duplicates would overshoot by a full extra copy
      // per seed and cause later seeds to be cut off by the slice below
      // more often than necessary.
      var copies = Math.ceil(wantCount / data.length) - 1;
      amplifyData(data, copies, function(moreData) {
        return callback(moreData.slice(0, wantCount));
      });
    });
  });
};
// Command-line entry point: runs only when this file is executed directly
// rather than being loaded by another module.
if (!module.parent) {
  // The last command-line argument is the number of blobs wanted; when it
  // does not parse to a non-zero integer, fall back to 1000.
  var wantCount = parseInt(process.argv.slice(-1)[0], 10);
  if (!wantCount) {
    wantCount = 1000;
  }

  // Print the generated blobs to stdout as JSON indented by 4 spaces.
  generate(wantCount, function(data) {
    var serialized = JSON.stringify(data, null, 4);
    process.stdout.write(serialized);
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment