Skip to content

Instantly share code, notes, and snippets.

@kmpm
Forked from dmitryame/Calculate standard deviation in mongo
Created September 12, 2011 16:41
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kmpm/1211724 to your computer and use it in GitHub Desktop.
Save kmpm/1211724 to your computer and use it in GitHub Desktop.
mongo db Standard deviation calculation with map reduce using the Welford algoritm.
/*
* (The MIT License)
*
* Copyright (c) 2011 Peter Magnusson <kmpm@birchroad.net>
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* 'Software'), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/
/**
* References
* http://www.johndcook.com/standard_deviation.html
* http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
*/
/**
* The RunningStat Class/Object is the actual heavy worker in this and the only thing you really want.
*/
var RunningStat = function(){
this.count=0; //counter for the number of samples
//use .push to add values to be calculated
this.push = function(x){
this.count++;
if(this.count==1){
this.old_mean = x;
this.new_mean = x;
this.old_sum = 0.0;
}
else{
this.new_mean = this.old_mean + (x - this.old_mean)/this.count;
this.new_sum = this.old_sum + (x - this.old_mean)*(x - this.new_mean);
this.old_mean = this.new_mean;
this.old_sum = this.new_sum;
}
};
//returns the mean
this.mean = function(){
return (this.count >0) ? this.new_mean : 0.0;
};
//returns the variance of the data
this.variance = function(){
return (this.count > 1) ? (this.new_sum/(this.count -1)) : 0.0;
};
//returns the standard deviation
this.standardDeviation = function(){
return Math.sqrt(this.variance());
};
}
//to be able to use a function within map/reduce and have it declared externally
//then you have to save it in the database first
db.system.js.save({_id:'RunningStat', value:RunningStat});
// map+reduce sample implementation
// sample data
{ "_id" : ObjectId("4caf19200d282159bf000001"), "date" : "2010-10-06", "seq" : "00:00:00,000", "method" : "getUserByScbeId", "duration" : 3 }
{ "_id" : ObjectId("4caf19200d282159bf000002"), "date" : "2010-10-06", "seq" : "00:00:00,116", "method" : "createTicket", "duration" : 116 }
{ "_id" : ObjectId("4caf19200d282159bf000003"), "date" : "2010-10-06", "seq" : "00:00:00,131", "method" : "getCollectionMetadata", "duration" : 11 }
{ "_id" : ObjectId("4caf19200d282159bf000004"), "date" : "2010-10-06", "seq" : "00:00:00,137", "method" : "getParticipation", "duration" : 6 }
{ "_id" : ObjectId("4caf19200d282159bf000005"), "date" : "2010-10-06", "seq" : "00:00:00,139", "method" : "updateSocialObjectModified", "duration" : 371 }
{ "_id" : ObjectId("4caf19200d282159bf000006"), "date" : "2010-10-06", "seq" : "00:00:00,143", "method" : "getUserByScbeId", "duration" : 4 }
map = function() {
emit(this.method, { duration : this.duration, count : 1});
}
var reduce = function(key,emits) {
var n = { count : 0, duration : 0, min : Number.MAX_VALUE, max : Number.MIN_VALUE };
var rs = new RunningStat();
for(var i in emits) {
var x = emits[i].duration;
n.count += emits[i].count;
n.duration += x;
n.max = (x>n.max) ? x: n.max;
n.min = (x<n.min) ? x: n.min;
rs.push(x);
}
n.avg=rs.mean();
n.variance=rs.variance();
n.stddv=rs.standardDeviation();
return n;
}
//since 1.7+ something you have to provide a output collection
db.logs.mapReduce(map, reduce, { out: "methods_stat"});
db.methods_stat.find();
@kmpm
Copy link
Author

kmpm commented Jun 26, 2012

@newleafdigital Thanks for the tip... when I did this one I couldn't find any stddev at all so I cooked this one up but you are correct, it's not idempotent and with the alternative you gave... I might not bother fixig this.... :)

@Pyrolistical
Copy link

@bebuckman I've made improvements to that alternative: https://gist.github.com/Pyrolistical/8139958

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment