# zmsmith/9003143 -- forked from Bpless/gist:1771930
import datetime
import random

from mongoengine.base import TopLevelDocumentMetaclass
from mongoengine.connection import get_db


class CompressedKeyDocumentMetaclass(TopLevelDocumentMetaclass):

    def __new__(cls, name, bases, attrs):
        """
        MongoEngine Document classes are constructed by the
        'TopLevelDocumentMetaclass' metaclass. We let that metaclass set
        the attrs on the class, then compress the fields whenever the
        resulting Document class carries the meta attr 'compress_keys'.

        This is not the most efficient flow. Going forward, we should
        either fork MongoEngine and insert this logic directly into
        TopLevelDocumentMetaclass, or process attrs before instantiating
        new_class.
        """
        new_class = super(CompressedKeyDocumentMetaclass, cls).__new__(
            cls, name, bases, attrs)
        if 'meta' in attrs and attrs['meta'].get('compress_keys', False):
            if hasattr(new_class, '_fields'):
                key_mapping = new_class._map_fields()
                # Handle index creation here by resetting the specs in
                # cls._meta['indexes'] to use the compressed field names
                if new_class._meta.get('indexes'):
                    for index in new_class._meta.get('indexes'):
                        i_list = []
                        for raw_field_name, direction in index['fields']:
                            compressed_name = key_mapping[raw_field_name]
                            i_list.append((compressed_name, direction))
                        index['fields'] = i_list
        return new_class
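    # Illustrative example (field names are hypothetical): with
    # compress_keys enabled and
    #     meta = {'indexes': [{'fields': [('title', 1), ('timestamp', -1)]}]}
    # the loop above rewrites the spec against the compressed db_keys,
    # e.g. [('t', 1), ('ti', -1)].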
    @property
    def _mapping_collection(cls):
        """
        Connects to (or creates on first lookup) a mapping collection
        whose name is the MongoEngine collection name with '_mapping'
        appended.
        """
        collection_name = '%s_mapping' % cls._get_collection_name()
        # The original referenced a bare `connection`; assume it was the
        # pymongo database handle that MongoEngine is connected to.
        return get_db()[collection_name]
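    # e.g. a Document stored in collection 'article' keeps its key map
    # in 'article_mapping' (collection name is illustrative).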
    def _is_embedded_field(cls, field):
        """
        Checks whether a given field wraps embedded documents (i.e. it
        exposes a non-None inner 'field' attribute).
        """
        return getattr(field, 'field', None) is not None
    def _field_name_set(cls, subfield=None):
        """
        Returns the set of field names at a given nesting level: the
        class's own fields by default, or the embedded document's fields
        when a subfield is passed.
        """
        if not subfield:
            fields = cls._fields.values()
        else:
            fields = subfield.field.document_type._fields.values()
        return set(f.name for f in fields)
    def _set_fields(cls, fields, collection=None, document=None):
        """
        Writes mapped field values to the mapping collection. Handles all
        fields the first time a class is evaluated; on subsequent
        evaluations, handles only the changed fields. Records the
        uncompressed field name, the compressed field name, and the
        datetime at which the field was added to the class.

        Compressed names are the minimum unique, sequential prefixes of
        the full string:
            'test'  --> 't'
            'trial' --> 'tr'
        We could avoid multi-char keys in a variety of ways; the
        advantage of this route is that compressed names more clearly
        relate to their uncompressed counterparts.

        The logic in the range iterator attempts to handle collisions
        such as:
            'rechandler' --> 'r'
            'recluse'    --> 're'
            'recsize'    --> 'rec'
            'rec'        --> 'rec_<random digit>'
        which may not be necessary (or elegant).

        Embedded fields are handled recursively. It may be possible to
        compress directly on the embedded document class, but that was
        not working for me; worth revisiting.

        TODO: handle setting of embedded fields whose parent field has
        not changed.
        """
        old_fields = dict(document.items()) if document else {}
        old_fields_name_set = set(f.get('db_key') for f in old_fields.values())
        new_fields_dict = {}
        for f in fields:
            f_len = len(f.db_field)
            if f.db_field not in ('_id', '_cls', '_types'):
                # Try successively longer prefixes until one is unique.
                # The iterations past f_len cover the edge case where the
                # full name itself collides with an existing key.
                for i in xrange(f_len + 5):
                    if i < f_len:
                        packed_name = f.db_field[:i + 1]
                    else:
                        # Every prefix (including the full name) collided;
                        # fall back to appending a random digit.
                        packed_name = '%s_%d' % (f.db_field,
                                                 random.randrange(1, 10))
                    if packed_name not in old_fields_name_set:
                        new_fields_dict[f.name] = {'db_key': packed_name,
                                                   'set': datetime.datetime.now()}
                        old_fields_name_set.add(packed_name)
                        f.db_field = packed_name
                        break
            else:
                new_fields_dict[f.db_field] = {'db_key': f.db_field,
                                               'set': datetime.datetime.now()}
            # Handle embedded documents recursively
            if cls._is_embedded_field(f):
                embedded_fields = cls._set_fields(
                    f.field.document_type._fields.values(), document=document)
                embed_dict = {}
                for embed in embedded_fields:
                    embed_dict[embed.name] = {'db_key': embed.db_field,
                                              'set': datetime.datetime.now()}
                new_fields_dict[f.name].update({'embedded_fields': embed_dict})
        if collection:
            if document:
                # Match the existing mapping document on one of its known
                # keys, then $set the new fields onto it.
                anchor_key, anchor_doc = old_fields.items()[0]
                spec = {'%s.db_key' % anchor_key: anchor_doc.get('db_key')}
                collection.update(spec, {'$set': new_fields_dict})
            else:
                collection.save(new_fields_dict)
            return new_fields_dict
        else:
            return fields
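    # Illustrative shape of a mapping document written by _set_fields,
    # assuming hypothetical fields 'title', 'timestamp' and an embedded
    # list field 'comments' with a 'body' subfield:
    #     {'title':     {'db_key': 't',  'set': datetime(...)},
    #      'timestamp': {'db_key': 'ti', 'set': datetime(...)},
    #      'comments':  {'db_key': 'c',  'set': datetime(...),
    #                    'embedded_fields':
    #                        {'body': {'db_key': 'b', 'set': datetime(...)}}}}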
    def _unset_fields(cls, collection, field_key, field_value, document,
                      embedded_key=None, embedded_key_packed=None):
        """
        Marks mapped fields inactive by looking up the appropriate key in
        the mapping document and adding an 'unset' attribute (naming
        should be refactored to avoid confusion with the $unset
        modifier). This records the datetime at which the field was
        inactivated rather than deleting the entry: if the entry were
        deleted, a new compressed name could be reassigned and conflict
        with documents already in the collection. Embedded fields are
        unset as well.
        """
        if field_key not in ('_id', '_cls', '_types'):
            if not embedded_key:
                old_doc = {'%s.db_key' % field_key: field_value}
                new_doc = {'%s.unset' % field_key: datetime.datetime.now()}
            else:
                old_doc = {'%s.embedded_fields.%s.db_key'
                           % (field_key, embedded_key): embedded_key_packed}
                new_doc = {'%s.embedded_fields.%s.unset'
                           % (field_key, embedded_key): datetime.datetime.now()}
            collection.update(old_doc, {'$set': new_doc})
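    # Illustrative update issued when a hypothetical field 'title'
    # (compressed to 't') is removed from the class:
    #     collection.update({'title.db_key': 't'},
    #                       {'$set': {'title.unset': datetime.datetime.now()}})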
    def _pack_field(cls, field, dict_key, dict_value):
        """
        Points a field's db_field at its compressed name when the mapping
        entry matches the field, returning the resulting db_field.
        """
        if dict_key == field.name:
            field.db_field = dict_value.get('db_key')
        return field.db_field
    def _map_fields(cls):
        """
        Builds (or refreshes) the key mapping for the class and points
        every field's db_field at its compressed name. Returns a dict
        mapping uncompressed field names to compressed names.
        """
        collection = cls._mapping_collection
        meta_keys_doc = collection.find_one()
        cls_fields = cls._fields.values()
        cls_field_set = cls._field_name_set()
        if not meta_keys_doc:
            meta_keys_doc = cls._set_fields(cls_fields, collection=collection)
        else:
            new_fields = [f for f in cls_fields
                          if f.name not in meta_keys_doc.keys()
                          and f.name is not None]
            if new_fields:
                fields_dict = cls._set_fields(new_fields, collection=collection,
                                              document=meta_keys_doc)
                meta_keys_doc.update(fields_dict)
        key_mapping = {}
        for field_key, field_value in meta_keys_doc.items():
            if not isinstance(field_value, dict):
                # Defensive: skip a bare ObjectId '_id' on the mapping doc
                continue
            # Unset inactive top-level fields
            if field_key not in cls_field_set and not field_value.get('unset'):
                cls._unset_fields(collection, field_key,
                                  field_value['db_key'], meta_keys_doc)
            else:
                for cf in cls_fields:
                    # Unset inactive embedded fields
                    if cls._is_embedded_field(cf):
                        embed_field_set = cls._field_name_set(cf)
                        for k, v in meta_keys_doc[cf.name]['embedded_fields'].items():
                            if not v.get('unset') and k not in embed_field_set:
                                cls._unset_fields(collection, cf.name, cf.db_field,
                                                  meta_keys_doc, embedded_key=k,
                                                  embedded_key_packed=v.get('db_key'))
                    if field_key == cf.name:
                        # Map every active field name on the class to its
                        # compressed name. Unlike _set_fields, this runs on
                        # every class evaluation.
                        key_mapping[field_key] = cls._pack_field(cf, field_key,
                                                                 field_value)
                        if cls._is_embedded_field(cf):
                            for f in cf.field.document_type._fields.values():
                                sub_key = field_value.get('embedded_fields').get(f.name)
                                if sub_key:
                                    f.db_field = sub_key.get('db_key')
        return key_mapping
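
# Usage sketch (not part of the original gist): opting a Document in to
# key compression. The class and field names below are hypothetical; this
# assumes Python 2 (__metaclass__ semantics), a reachable MongoDB server,
# and an index spec already shaped the way __new__ above expects.
if __name__ == '__main__':
    import mongoengine

    mongoengine.connect('compressed_keys_demo')

    class Article(mongoengine.Document):
        __metaclass__ = CompressedKeyDocumentMetaclass
        meta = {'compress_keys': True,
                'indexes': [{'fields': [('title', 1)]}]}
        title = mongoengine.StringField()
        timestamp = mongoengine.DateTimeField()

    # Stored with compressed keys, roughly {'t': 'hello', 'ti': ...}
    Article(title='hello', timestamp=datetime.datetime.now()).save()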