# zmsmith/9003143 -- forked from Bpless/gist:1771930
import datetime
import random

from mongoengine.base import TopLevelDocumentMetaclass
from mongoengine.connection import get_db


class CompressedKeyDocumentMetaclass(TopLevelDocumentMetaclass):

    def __new__(cls, name, bases, attrs):
        """
        MongoEngine Document classes are constructed by the
        'TopLevelDocumentMetaclass' metaclass. We let that metaclass set
        the attrs on the class, then compress the fields whenever the
        resulting Document class carries the meta attr 'compress_keys'.

        This is not the most efficient flow. Going forward, we should
        either fork MongoEngine and insert this logic directly into
        TopLevelDocumentMetaclass, or process attrs before instantiating
        new_class.
        """
        new_class = super(CompressedKeyDocumentMetaclass, cls).__new__(
            cls, name, bases, attrs)
        if 'meta' in attrs and attrs['meta'].get('compress_keys', False):
            if hasattr(new_class, '_fields'):
                key_mapping = new_class._map_fields()
                # Handle index creation here by resetting the specs in
                # cls._meta['indexes'] to use the compressed field names
                if new_class._meta.get('indexes'):
                    for index in new_class._meta.get('indexes'):
                        i_list = []
                        for raw_field_name, direction in index['fields']:
                            compressed_name = key_mapping[raw_field_name]
                            i_list.append((compressed_name, direction))
                        index['fields'] = i_list
        return new_class
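    # Illustrative example (field names are hypothetical): with
    # compress_keys enabled and
    #     meta = {'indexes': [{'fields': [('title', 1), ('timestamp', -1)]}]}
    # the loop above rewrites the spec against the compressed db_keys,
    # e.g. [('t', 1), ('ti', -1)].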
    @property
    def _mapping_collection(cls):
        """
        Connects to (or creates on first lookup) a mapping collection
        whose name is the MongoEngine collection name with '_mapping'
        appended.
        """
        collection_name = '%s_mapping' % cls._get_collection_name()
        # The original referenced a bare `connection`; assume it was the
        # pymongo database handle that MongoEngine is connected to.
        return get_db()[collection_name]
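    # e.g. a Document stored in collection 'article' keeps its key map
    # in 'article_mapping' (collection name is illustrative).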
    def _is_embedded_field(cls, field):
        """
        Checks whether a given field wraps embedded documents (i.e. it
        exposes a non-None inner 'field' attribute).
        """
        return getattr(field, 'field', None) is not None
    def _field_name_set(cls, subfield=None):
        """
        Returns the set of field names at a given nesting level: the
        class's own fields by default, or the embedded document's fields
        when a subfield is passed.
        """
        if not subfield:
            fields = cls._fields.values()
        else:
            fields = subfield.field.document_type._fields.values()
        return set(f.name for f in fields)
    def _set_fields(cls, fields, collection=None, document=None):
        """
        Writes mapped field values to the mapping collection. Handles all
        fields the first time a class is evaluated; on subsequent
        evaluations, handles only the changed fields. Records the
        uncompressed field name, the compressed field name, and the
        datetime at which the field was added to the class.

        Compressed names are the minimum unique, sequential prefixes of
        the full string:
            'test'  --> 't'
            'trial' --> 'tr'
        We could avoid multi-char keys in a variety of ways; the
        advantage of this route is that compressed names more clearly
        relate to their uncompressed counterparts.

        The logic in the range iterator attempts to handle collisions
        such as:
            'rechandler' --> 'r'
            'recluse'    --> 're'
            'recsize'    --> 'rec'
            'rec'        --> 'rec_<random digit>'
        which may not be necessary (or elegant).

        Embedded fields are handled recursively. It may be possible to
        compress directly on the embedded document class, but that was
        not working for me; worth revisiting.

        TODO: handle setting of embedded fields whose parent field has
        not changed.
        """
        old_fields = dict(document.items()) if document else {}
        old_fields_name_set = set(f.get('db_key') for f in old_fields.values())
        new_fields_dict = {}
        for f in fields:
            f_len = len(f.db_field)
            if f.db_field not in ('_id', '_cls', '_types'):
                # Try successively longer prefixes until one is unique.
                # The iterations past f_len cover the edge case where the
                # full name itself collides with an existing key.
                for i in xrange(f_len + 5):
                    if i < f_len:
                        packed_name = f.db_field[:i + 1]
                    else:
                        # Every prefix (including the full name) collided;
                        # fall back to appending a random digit.
                        packed_name = '%s_%d' % (f.db_field,
                                                 random.randrange(1, 10))
                    if packed_name not in old_fields_name_set:
                        new_fields_dict[f.name] = {'db_key': packed_name,
                                                   'set': datetime.datetime.now()}
                        old_fields_name_set.add(packed_name)
                        f.db_field = packed_name
                        break
            else:
                new_fields_dict[f.db_field] = {'db_key': f.db_field,
                                               'set': datetime.datetime.now()}
            # Handle embedded documents recursively
            if cls._is_embedded_field(f):
                embedded_fields = cls._set_fields(
                    f.field.document_type._fields.values(), document=document)
                embed_dict = {}
                for embed in embedded_fields:
                    embed_dict[embed.name] = {'db_key': embed.db_field,
                                              'set': datetime.datetime.now()}
                new_fields_dict[f.name].update({'embedded_fields': embed_dict})
        if collection:
            if document:
                # Match the existing mapping document on one of its known
                # keys, then $set the new fields onto it.
                anchor_key, anchor_doc = old_fields.items()[0]
                spec = {'%s.db_key' % anchor_key: anchor_doc.get('db_key')}
                collection.update(spec, {'$set': new_fields_dict})
            else:
                collection.save(new_fields_dict)
            return new_fields_dict
        else:
            return fields
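    # Illustrative shape of a mapping document written by _set_fields,
    # assuming hypothetical fields 'title', 'timestamp' and an embedded
    # list field 'comments' with a 'body' subfield:
    #     {'title':     {'db_key': 't',  'set': datetime(...)},
    #      'timestamp': {'db_key': 'ti', 'set': datetime(...)},
    #      'comments':  {'db_key': 'c',  'set': datetime(...),
    #                    'embedded_fields':
    #                        {'body': {'db_key': 'b', 'set': datetime(...)}}}}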
    def _unset_fields(cls, collection, field_key, field_value, document,
                      embedded_key=None, embedded_key_packed=None):
        """
        Marks mapped fields inactive by looking up the appropriate key in
        the mapping document and adding an 'unset' attribute (naming
        should be refactored to avoid confusion with the $unset
        modifier). This records the datetime at which the field was
        inactivated rather than deleting the entry: if the entry were
        deleted, a new compressed name could be reassigned and conflict
        with documents already in the collection. Embedded fields are
        unset as well.
        """
        if field_key not in ('_id', '_cls', '_types'):
            if not embedded_key:
                old_doc = {'%s.db_key' % field_key: field_value}
                new_doc = {'%s.unset' % field_key: datetime.datetime.now()}
            else:
                old_doc = {'%s.embedded_fields.%s.db_key'
                           % (field_key, embedded_key): embedded_key_packed}
                new_doc = {'%s.embedded_fields.%s.unset'
                           % (field_key, embedded_key): datetime.datetime.now()}
            collection.update(old_doc, {'$set': new_doc})
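    # Illustrative update issued when a hypothetical field 'title'
    # (compressed to 't') is removed from the class:
    #     collection.update({'title.db_key': 't'},
    #                       {'$set': {'title.unset': datetime.datetime.now()}})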
    def _pack_field(cls, field, dict_key, dict_value):
        """
        Points a field's db_field at its compressed name when the mapping
        entry matches the field, returning the resulting db_field.
        """
        if dict_key == field.name:
            field.db_field = dict_value.get('db_key')
        return field.db_field
    def _map_fields(cls):
        """
        Builds (or refreshes) the key mapping for the class and points
        every field's db_field at its compressed name. Returns a dict
        mapping uncompressed field names to compressed names.
        """
        collection = cls._mapping_collection
        meta_keys_doc = collection.find_one()
        cls_fields = cls._fields.values()
        cls_field_set = cls._field_name_set()
        if not meta_keys_doc:
            meta_keys_doc = cls._set_fields(cls_fields, collection=collection)
        else:
            new_fields = [f for f in cls_fields
                          if f.name not in meta_keys_doc.keys()
                          and f.name is not None]
            if new_fields:
                fields_dict = cls._set_fields(new_fields, collection=collection,
                                              document=meta_keys_doc)
                meta_keys_doc.update(fields_dict)
        key_mapping = {}
        for field_key, field_value in meta_keys_doc.items():
            if not isinstance(field_value, dict):
                # Defensive: skip a bare ObjectId '_id' on the mapping doc
                continue
            # Unset inactive top-level fields
            if field_key not in cls_field_set and not field_value.get('unset'):
                cls._unset_fields(collection, field_key,
                                  field_value['db_key'], meta_keys_doc)
            else:
                for cf in cls_fields:
                    # Unset inactive embedded fields
                    if cls._is_embedded_field(cf):
                        embed_field_set = cls._field_name_set(cf)
                        for k, v in meta_keys_doc[cf.name]['embedded_fields'].items():
                            if not v.get('unset') and k not in embed_field_set:
                                cls._unset_fields(collection, cf.name, cf.db_field,
                                                  meta_keys_doc, embedded_key=k,
                                                  embedded_key_packed=v.get('db_key'))
                    if field_key == cf.name:
                        # Map every active field name on the class to its
                        # compressed name. Unlike _set_fields, this runs on
                        # every class evaluation.
                        key_mapping[field_key] = cls._pack_field(cf, field_key,
                                                                 field_value)
                        if cls._is_embedded_field(cf):
                            for f in cf.field.document_type._fields.values():
                                sub_key = field_value.get('embedded_fields').get(f.name)
                                if sub_key:
                                    f.db_field = sub_key.get('db_key')
        return key_mapping
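
# Usage sketch (not part of the original gist): opting a Document in to
# key compression. The class and field names below are hypothetical; this
# assumes Python 2 (__metaclass__ semantics), a reachable MongoDB server,
# and an index spec already shaped the way __new__ above expects.
if __name__ == '__main__':
    import mongoengine

    mongoengine.connect('compressed_keys_demo')

    class Article(mongoengine.Document):
        __metaclass__ = CompressedKeyDocumentMetaclass
        meta = {'compress_keys': True,
                'indexes': [{'fields': [('title', 1)]}]}
        title = mongoengine.StringField()
        timestamp = mongoengine.DateTimeField()

    # Stored with compressed keys, roughly {'t': 'hello', 'ti': ...}
    Article(title='hello', timestamp=datetime.datetime.now()).save()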