Skip to content

Instantly share code, notes, and snippets.

@hiropppe
Last active October 27, 2015 14:30
Show Gist options
  • Save hiropppe/ce5552b03a17e9e47173 to your computer and use it in GitHub Desktop.
Save hiropppe/ce5552b03a17e9e47173 to your computer and use it in GitHub Desktop.
import pymongo
from bson.dbref import DBRef
from collections import OrderedDict
from random import randrange, choice
client = pymongo.MongoClient()
test = client.test
spot = test.spot
play = test.play
# 1:1
spot.insert_many([{'name': 'spot_{}'.format(i), 'seq': i} for i in xrange(0,100000)])
# Query JOIN
def insert_children_as_objectid():
for r in spot.find():
play.insert_many([{'name': 'play_{}'.format(i), 'spot': r['_id'], 'seq': i} for i in xrange(0,10)])
%time insert_children_as_objectid()
CPU times: user 45.4 s, sys: 6.58 s, total: 52 s
Wall time: 2min 32s
play.create_index("spot")
play.create_index("seq")
def join_by_query(seq):
for r in play.find({'seq': seq}).limit(100):
r['spot'] = spot.find_one({'_id': r['spot']})
def join_by_query2(seq):
od = OrderedDict()
spot_ids = []
for r in play.find({'seq':seq}).limit(100):
od[r['_id']] = r
spot_ids.append(r['spot'])
spot_data = {}
for s in spot.find({'_id': {'$in': spot_ids}}):
spot_data[s['_id']] = s
for value in od.values():
value['spot'] = spot_data[value['spot']]
%timeit -n100 join_by_query(randrange(0,10))
100 loops, best of 3: 25.1 ms per loop
%timeit -n100 join_by_query2(randrange(0,10))
100 loops, best of 3: 2.78 ms per loop
# DBRef JOIN
play.remove()
def insert_children_as_dbref():
for r in spot.find():
play.insert_many([{'name': 'play_{}'.format(i), 'spot': DBRef(collection='spot', id=r['_id']), 'seq': i} for i in xrange(0,10)])
%time insert_children_as_dbref()
CPU times: user 1min 3s, sys: 12 s, total: 1min 15s
Wall time: 2min 37s
def join_by_dbref(seq):
for r in play.find({'seq': seq}).limit(100):
r['spot'] = test.dereference(r['spot'])
%timeit -n100 join_by_dbref(randrange(0,10))
100 loops, best of 3: 27.8 ms per loop
# 1:N
spot.remove()
spot.insert_many([{'name': 'spot_{}'.format(i), 'seq': i} for i in xrange(0,10000)])
# Query JOIN
play.remove()
play.insert_many([{'name': 'play_{}'.format(i), 'seq': i} for i in xrange(0,100000)])
def save_children_as_objectid_array():
for p in play.find(no_cursor_timeout=True).batch_size(100):
spots = [s['_id'] for s in spot.find({'seq': {'$in': [randrange(0, choice([10,100,1000,10000])) for i in xrange(0,5)]}})]
play.update({'_id': p['_id']}, {'$set': {'spot': spots}})
%time save_children_as_objectid_array()
CPU times: user 1min 4s, sys: 12.7 s, total: 1min 17s
Wall time: 12min 23s
def join_by_in_query(num_row):
for r in play.find({'seq': {'$in': [randrange(0, choice([10,100,1000,10000])) for i in xrange(0,num_row)]}}):
r['spot'] = [s for s in spot.find({'_id': {'$in': r['spot']}})]
def join_by_in_query2(num_row):
od = OrderedDict()
spot_ids = []
for r in play.find({'seq': {'$in': [randrange(0, choice([10,100,1000,10000])) for i in xrange(0,num_row)]}}):
od[r['_id']] = r
spot_ids.extend([s for s in r['spot']])
spot_ids = set(spot_ids)
spot_data = {}
for s in spot.find({'_id': {'$in': list(spot_ids)}}):
spot_data[s['_id']] = s
for value in od.values():
value['spot'] = [spot_data[s] for s in value['spot']]
%timeit -n100 join_by_in_query(100)
100 loops, best of 3: 28.3 ms per loop
%timeit -n100 join_by_in_query2(100)
100 loops, best of 3: 6.62 ms per loop
# DBRef JOIN
play.remove()
play.insert_many([{'name': 'play_{}'.format(i), 'seq': i} for i in xrange(0,100000)])
def save_children_as_dbref_array():
for p in play.find(no_cursor_timeout=True).batch_size(100):
spots = [DBRef(collection='spot', id=s['_id']) for s in spot.find({'seq': {'$in': [randrange(0, choice([10,100,1000,10000])) for i in xrange(0,3)]}})]
play.update({'_id': p['_id']}, {'$set': {'spot': spots}})
%time save_children_as_dbref_array()
CPU times: user 1min 5s, sys: 13.9 s, total: 1min 19s
Wall time: 11min 55s
def join_by_in_dbref(num_row):
for r in play.find({'seq': {'$in': [randrange(0, choice([10,100,1000,10000])) for i in xrange(0,num_row)]}}):
r['spot'] = [test.dereference(s) for s in r['spot']]
%timeit -n100 join_by_in_dbref(100)
100 loops, best of 3: 6.15 ms per loop
# map_reduct
'''
from bson.code import Code
m = Code("""
function() {
emit(this._id, this.spot);
}
""")
r = Code("""
function (key, values) {
var spot = db.getSiblingDB('test')['spot']
var result = [];
values.forEach(function(value){
result.push(spot.findOne())
});
return result;
}
""")
result = play.map_reduce(m, r, "myresult", query={"seq": {'$in': [0,1,2]}})
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment