-
-
Save elifarley/1793dd71e8938d198aa7 to your computer and use it in GitHub Desktop.
Compute sample standard deviation, coefficient of variation and other stats over large datasets in RethinkDB using Map-Reduce in ReQL.
See https://github.com/rethinkdb/rethinkdb/issues/3769#issuecomment-77229538
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
> load('functions.js') | |
> db.stuff.drop() | |
false | |
> db.stuff.insert({value:1}) | |
> db.stuff.insert({value:2}) | |
> db.stuff.insert({value:2}) | |
> db.stuff.insert({value:2}) | |
> db.stuff.insert({value:3}) | |
> db.stuff.mapReduce(map, reduce, {finalize:finalize, out:{inline:1}}).results[0] | |
{ | |
"_id" : 1, | |
"value" : { | |
"sum" : 10, | |
"min" : 1, | |
"max" : 3, | |
"count" : 5, | |
"diff" : 2, | |
"avg" : 2, | |
"variance" : 0.4, | |
"stddev" : 0.6324555320336759 | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
r.table('t').group(r.row['G']).map(r.row['value']).map(lambda x: { | |
'count': 1, | |
'sum': x, | |
'min': x, | |
'max': x, | |
'diff': 0 # M2,n: sum((val-mean)^2) | |
}).reduce(lambda a, b: { | |
'count': r.add(a['count'], b['count']), | |
'sum': r.add(a['sum'], b['sum']), | |
'min': r.branch(a['min'].lt(b['min']), a['min'], b['min']), | |
'max': r.branch(a['max'].gt(b['max']), a['max'], b['max']), | |
# See https://gist.github.com/Pyrolistical/8139958 | |
# diff: a.diff + b.diff + avgdelta * avgdelta * weight | |
'diff': r.add( | |
a['diff'], | |
b['diff'], | |
r.do( | |
# temp helpers | |
# avgdelta: a.sum/a.count - b.sum/b.count; // a.mean - b.mean | |
# weight: a.count * b.count / (a.count + b.count) | |
r.sub(a['sum'].div(a['count']), b['sum'].div(b['count'])), | |
r.div(a['count'].mul(b['count']), a['count'].add(b['count'])), | |
lambda avgdelta, weight: r.mul(avgdelta, avgdelta, weight) | |
) | |
) | |
}).ungroup().map(lambda g: { | |
'G': g['group'], | |
'count': g['reduction']['count'], | |
'sum': g['reduction']['sum'], | |
'min': g['reduction']['min'], | |
'max': g['reduction']['max'], | |
# sample variance | |
's2': r.branch(g['reduction']['count'].gt(1), r.div(g['reduction']['diff'], g['reduction']['count'].sub(1)), 0) | |
}).merge(lambda d: r.do( | |
r.div(d['sum'], d['count']), # avg | |
r.branch(d['s2'].eq(0), 0, r.do( d['s2'], r.js('(function(x){ return Math.sqrt(x) })') )), # sample standard deviation | |
lambda avg, s: { | |
'avg': avg, | |
's': s, | |
'CV': r.branch(avg.eq(0), 0, r.div(s, avg)) | |
})) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment