Skip to content

Instantly share code, notes, and snippets.

@elifarley
Forked from RedBeard0531/functions.js
Last active August 29, 2015 14:16
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save elifarley/1793dd71e8938d198aa7 to your computer and use it in GitHub Desktop.
Save elifarley/1793dd71e8938d198aa7 to your computer and use it in GitHub Desktop.
Compute sample standard deviation, coefficient of variation and other stats over large datasets in RethinkDB using Map-Reduce in ReQL. See https://github.com/rethinkdb/rethinkdb/issues/3769#issuecomment-77229538
> load('functions.js')
> db.stuff.drop()
false
> db.stuff.insert({value:1})
> db.stuff.insert({value:2})
> db.stuff.insert({value:2})
> db.stuff.insert({value:2})
> db.stuff.insert({value:3})
> db.stuff.mapReduce(map, reduce, {finalize:finalize, out:{inline:1}}).results[0]
{
"_id" : 1,
"value" : {
"sum" : 10,
"min" : 1,
"max" : 3,
"count" : 5,
"diff" : 2,
"avg" : 2,
"variance" : 0.4,
"stddev" : 0.6324555320336759
}
}
r.table('t').group(r.row['G']).map(r.row['value']).map(lambda x: {
'count': 1,
'sum': x,
'min': x,
'max': x,
'diff': 0 # M2,n: sum((val-mean)^2)
}).reduce(lambda a, b: {
'count': r.add(a['count'], b['count']),
'sum': r.add(a['sum'], b['sum']),
'min': r.branch(a['min'].lt(b['min']), a['min'], b['min']),
'max': r.branch(a['max'].gt(b['max']), a['max'], b['max']),
# See https://gist.github.com/Pyrolistical/8139958
# diff: a.diff + b.diff + avgdelta * avgdelta * weight
'diff': r.add(
a['diff'],
b['diff'],
r.do(
# temp helpers
# avgdelta: a.sum/a.count - b.sum/b.count; // a.mean - b.mean
# weight: a.count * b.count / (a.count + b.count)
r.sub(a['sum'].div(a['count']), b['sum'].div(b['count'])),
r.div(a['count'].mul(b['count']), a['count'].add(b['count'])),
lambda avgdelta, weight: r.mul(avgdelta, avgdelta, weight)
)
)
}).ungroup().map(lambda g: {
'G': g['group'],
'count': g['reduction']['count'],
'sum': g['reduction']['sum'],
'min': g['reduction']['min'],
'max': g['reduction']['max'],
# sample variance
's2': r.branch(g['reduction']['count'].gt(1), r.div(g['reduction']['diff'], g['reduction']['count'].sub(1)), 0)
}).merge(lambda d: r.do(
r.div(d['sum'], d['count']), # avg
r.branch(d['s2'].eq(0), 0, r.do( d['s2'], r.js('(function(x){ return Math.sqrt(x) })') )), # sample standard deviation
lambda avg, s: {
'avg': avg,
's': s,
'CV': r.branch(avg.eq(0), 0, r.div(s, avg))
}))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment