Skip to content

Instantly share code, notes, and snippets.

@chrisklaiber
Last active September 16, 2016 22:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chrisklaiber/46b24723d93182e3f81068572566b18a to your computer and use it in GitHub Desktop.
Save chrisklaiber/46b24723d93182e3f81068572566b18a to your computer and use it in GitHub Desktop.
namedmap and tuplemap: quacks like a dict, has the memory footprint of a tuple (eh, good enough)
"""A space-efficient frozen (write-once) mapping class.
A named map has two key properties:
1. The interface is like a dict.
2. Memory usage is like an object that defines __slots__.
Call namedmap.frozennamedmap() to define a subclass, passing the name
of the type and an iterable of key names.
>>> Point = namedmap.frozennamedmap('Point', ('x', 'y'))
>>> Point.__doc__
'Point(x, y)'
>>> origin = Point({'x': 0, 'y': 0})
>>> origin['x']
0
>>> partial = Point({'y': 1})
>>> partial['y']
1
>>> partial['x']
KeyError: 'x'
This new data structure was motivated by a want to reduce the
in-memory size of Khan Academy's content database, which contains tens
of thousands of dicts. Read all about it here:
http://engineering.khanacademy.org/posts/evolving-our-content-infrastructure.htm
When the keys and values are interned and heavily shared, like they
are in the frozen model store, the overhead of the container can be
significant. A dict (280 bytes) is much larger than an object with
__slots__ (120 bytes with one property, +8 bytes per property).
A named map keeps the smaller in-memory representation of an object
with __slots__ while serialization and access behave more like a dict.
From July 5, 2016, here is data from switching the 'item' key's list
of Khan Academy's FrozenExercise.problem_types to other data types:
dict in-memory:43465992 pickled:9503491 zipped:3922128
namedtuple in-memory:16743448 pickled:5496541 zipped:2945814
namedmap in-memory:16181984 pickled:6276168 zipped:2996040
namedmap can be more compact than namedtuple because namedtuple
requires that all values be specified, even if set to None, whereas
namedmap's keys are optional.
"""
import collections
import sys
def _extend_with_mapping_definitions(cls):
"""Leverage collections.Mapping's read-only mapping definitions.
Why not use inheritance? collections.Mapping doesn't use __slots__
and so inheriting from it creates a __dict__ on instances, which
is what the use of __slots__ is trying to avoid!
We use this as a class decorator to mix in the collections.Mapping
definitions. Existing attributes are not overridden.
"""
combined_dict = collections.Mapping.__dict__.copy()
combined_dict.update(cls.__dict__)
# Kind of hacky: a Python class's __dict__ is a read-only
# dictproxy (as opposed to a class instance's __dict__, which
# won't exist in this case because we're using __slots__). This
# redefinition is how we modify the class's dictproxy.
return type(cls.__name__, cls.__bases__, combined_dict)
@_extend_with_mapping_definitions
class _FrozenNamedMap(object):
__slots__ = () # subclass must define __slots__
def __new__(cls, *args, **kwargs):
# We enforce the same creation interface as dict(), and we
# further restrict to only known field names.
argdict = dict(*args, **kwargs)
result = object.__new__(cls)
for key in cls.__slots__:
if key in argdict:
setattr(result, key, argdict.pop(key))
if argdict:
raise TypeError('%s unexpected keys: %r'
% (cls.__name__, argdict.keys()))
return result
def __repr__(self):
return '%s({%s})' % (
self.__class__.__name__,
', '.join('%r: %r' % kv_pair for kv_pair in self.iteritems()))
def __reduce__(self):
"""Return self as a plain tuple of key-value pairs. Used by pickle."""
# We could have more compact pickled representation but it's a
# smallish savings and the cost is "unfreezing" this structure
# by implementing __setitem__. To understand why, see pickle's
# docs for __reduce__.
#
# The more compact implementation is:
# return (self.__class__, (), None, None, self.iteritems())
#
# Here are pickled sizes of ~8k FrozenExercise.problem_types
# converted to namedtuple and namedmap, as implemented below
# (baseline) and with the above modification (improved):
#
# pickled zipped %-of-baseline
# namedtuple 5495485 2945178 87.6%
# namedmap baseline 6275110 2995217 100.0%
# namedmap improved 6047125 2966671 96.4%
# __reduce__ provides fine-grained pickle control. The first
# element of the result is a callable, our class. The second
# element is a tuple of arguments for the callable object.
# For a map with keys 'x', 'y' this returns:
# (self.__class__, ((('x', 1), ('y', 2)),))
# which, in depickling, is the call:
# self.__class__((('x', 1), ('y', 2)))
arg1 = tuple(self.iteritems())
return (self.__class__, (arg1,))
# The definitions from collections.Mapping require __len__,
# __iter__, and __getitem__.
def __len__(self):
n = 0
for _ in self:
n += 1
return n
def __iter__(self):
# Callers must not rely on the order of iteration. This is
# intended as a substitute for dict.
default = object()
for key in self.__slots__:
if getattr(self, key, default) is not default:
yield key
def __getitem__(self, key):
if not isinstance(key, str):
raise TypeError('%s keys must be strings, not %s'
% (self.__class__.__name__,
key.__class__.__name__))
try:
return getattr(self, key)
except AttributeError:
raise KeyError(key)
# We may someday want to implement __setitem__ and __delitem__ and
# use definitions from collections.MutableMapping. Maybe in a
# subclass called _NamedMap, or maybe we invert the hierarchy to
# have the frozen version be the subclass. For now, though, it's
# kind of nice to be immutable.
def __setitem__(self, key, value):
raise NotImplementedError()
def __delitem__(self, key):
raise NotImplementedError()
def frozennamedmap(typename, keys):
"""Return a new immutable namedmap with named keys."""
keys = tuple(keys)
for key in keys:
assert isinstance(key, str), key
typedict = {
'__slots__': keys,
'__doc__': '%s(%s)' % (typename, ', '.join(keys)),
}
result = type(typename, (_FrozenNamedMap,), typedict)
# Setting __module__ is taken from collection.py in namedtuple().
# For pickling to work, the __module__ variable needs to be set to
# the frame where the named tuple is created. Bypass this step in
# environments where sys._getframe is not defined (Jython for
# example) or sys._getframe is not defined for arguments greater
# than 0 (IronPython).
try:
result.__module__ = sys._getframe(1).f_globals.get('__name__',
'__main__')
except (AttributeError, ValueError):
pass
return result
import json
import pickle
import unittest
from pympler import asizeof
import namedmap
Point = namedmap.frozennamedmap('Point', ('x', 'y'))
class TestNamedMap(unittest.TestCase):
def _assert_x1y2_point(self, pt):
self.assertEqual(pt, {'x': 1, 'y': 2})
self.assertEqual(pt['x'], 1)
self.assertEqual(pt['y'], 2)
with self.assertRaises(KeyError):
pt['z']
self.assertIn('x', pt)
self.assertNotIn('z', pt)
self.assertEqual(pt.get('x'), 1)
self.assertEqual(pt.get('y'), 2)
self.assertIsNone(pt.get('z'))
self.assertEqual(dict(pt.items()), {'x': 1, 'y': 2})
self.assertEqual(pt.values(), [1, 2])
self.assertEqual(pt.keys(), ['x', 'y'])
self.assertEqual(dict(pt.iteritems()), {'x': 1, 'y': 2})
self.assertEqual(sorted(pt.itervalues()), [1, 2])
self.assertEqual(sorted(pt.iterkeys()), ['x', 'y'])
self.assertEqual(len(pt), 2)
def test_interface(self):
self.assertEqual(Point(), {})
self.assertEqual(Point(x=1), {'x': 1})
pt = Point(x=1, y=2)
self._assert_x1y2_point(pt)
# Check __slots__-object-like behaviors.
self.assertEqual(pt.x, 1)
self.assertEqual(pt.y, 2)
with self.assertRaises(AttributeError):
_ = Point().x
# Check for unexpected keys.
with self.assertRaises(TypeError):
Point(z=3)
# Ensure frozenness.
with self.assertRaises(NotImplementedError):
pt['x'] = 1
with self.assertRaises(NotImplementedError):
del pt['x']
# Ensure that two instances that should have different
# lengths, do have different lengths. This would have detected
# an implementation bug where caching meant the same value was
# returned for all instances.
partial_pt = Point(x=1)
self.assertEqual(len(pt), 2)
self.assertEqual(len(partial_pt), 1)
def test_pickling(self):
pt = Point(x=1, y=2)
# Ensure that all protocols work (this errored back when
# namedmap implemented __getnewargs__ rather than __reduce__).
for protocol in xrange(pickle.HIGHEST_PROTOCOL + 1):
pt_again = pickle.loads(pickle.dumps(pt, protocol))
self._assert_x1y2_point(pt_again)
# Test pickling when only some keys are set.
partial_pt = pickle.loads(pickle.dumps(Point(x=1)))
self.assertEqual(partial_pt['x'], 1)
self.assertNotIn('y', partial_pt)
partial_pt = pickle.loads(pickle.dumps(Point(y=2)))
self.assertEqual(partial_pt['y'], 2)
self.assertNotIn('x', partial_pt)
def test_constructor(self):
pt = Point(y=2, x=1)
self.assertEqual(pt['x'], 1)
self.assertEqual(pt['y'], 2)
self.assertEqual(Point([('x', 1)])['x'], 1)
self.assertEqual(Point({'x': 1})['x'], 1)
self.assertEqual(Point([('x', 1)], x=0)['x'], 0)
def test_class_structure(self):
self.assertEqual(Point().__slots__, ('x', 'y'))
# Ensure we use __slots__ and don't incur __dict__ overhead.
with self.assertRaises(AttributeError):
Point().__dict__
def test_memory_size(self):
# Roughly test the core purpose of tuple map, in-memory size.
solar_system_distances = dict((('Mercury', 0.39), ('Venus', 0.723),
('Earth', 1.0), ('Mars', 1.524),
('Jupiter', 5.203), ('Saturn', 9.539),
('Uranus', 19.18), ('Neptune', 30.06),
('Pluto', 39.53)))
myclass = namedmap.frozennamedmap('myclass',
tuple(solar_system_distances))
asdicts = [dict(**solar_system_distances) for _ in xrange(1000)]
asnamedmaps = [myclass(**solar_system_distances) for _ in xrange(1000)]
# On chris' Macbook Pro dicts are ~1MB and namedmaps ~130KB.
self.assertGreater(asizeof.asizeof(asdicts), 950 * 1000)
self.assertLess(asizeof.asizeof(asnamedmaps), 150 * 1000)
def test_json(self):
# The json module from the standard library doesn't know how
# to serialize custom object.
with self.assertRaises(TypeError):
json.dumps(Point(x=1, y=2))
# NOTE: This library is obsoleted by namedmap, which should be used
# instead. The reason is that a tuplemap has serious serialization
# problems with Python's built-in json library (it serializes as a list,
# not an object). Furthermore, due to a bug in pympler.asizeof, a the
# space-saving benefits of a tuplemap had been exaggerated.
"""Defines a dict-like structure with a compact in-memory representation.
Call tuplemap.frozentuplemap() to define a subclass, passing the name
of the type and an iterable of field names:
>>> Point = tuplemap.frozentuplemap('Point', ('x', 'y'))
>>> Point.__doc__
'Point(x, y)'
>>> origin = Point(x=0, y=0)
>>> origin['x']
0
>>> partial = Point(y=1)
>>> partial['y']
1
>>> partial['x']
KeyError: 'x'
Be careful when serializing tuplemaps. Using pickle is OK, but json,
yaml, or similar will serialize tuplemap to a list of key names and
lose the mapping. Instead use tuplemap.unpack() to get a dict:
>>> Point = tuplemap.frozentuplemap('Point', ('x', 'y'))
>>> partial = Point(y=1)
>>> tuplemap.unpack(partial)
{"y":1}
It's safe to unpack non-tuplemaps. They are returned as-is:
>>> tuplemap.unpack(1)
1
>>> tuplemap.unpack([1, 2, 3])
[1, 2, 3]
A tuple map has three key properties:
1. The interface is like a dict.
2. Memory usage is like a tuple.
3. When pickled and unpickled, the ordering and name of fields is
flexible, like a dict and unlike collections.namedtuple.
This new data structure was motivated by a want to reduce the
in-memory size of Khan Academy's content database, which contains tens
of thousands of dicts. Read all about it here:
http://engineering.khanacademy.org/posts/evolving-our-content-infrastructure.htm
When the keys and values are interned and heavily shared, like they
are in the frozen model store, the overhead of the container can be
significant. An dict (280 bytes), and an object with __slots__ (120
bytes with one property, +8 bytes per property) can't compete with a
tuple (56 bytes +8 bytes per entry). However, tuples aren't very
robust to code changes and it's difficult to evolve their
schema. Furthermore, changing many downstream callers that expect a
dict to use something else is time-consuming and error-prone.
A tuple map keeps the in-memory representation as small as a tuple,
while pickling and access behave like a dict.
From May 2016, here is data from switching the 'item' key's list of
Khan Academy's FrozenExercise.problem_types to other data types:
dict in-memory:41626912 pickled:8974467 zipped:3754914
tuplemap in-memory:11618840 pickled:6259246 zipped:2975254
namedtuple in-memory:11686976 pickled:5590038 zipped:2931181
"""
import sys
class _FrozenTupleMap(tuple):
__slots__ = ()
_fields = () # subclasses should define named fields
# This value is used to know a key isn't mapped to a value.
_unset = object()
def __new__(cls, *args, **kwargs):
"""Create new instance."""
# We enforce the same creation interface as dict(), and we
# further restrict to only known field names.
argdict = dict(*args, **kwargs)
# A key instance invariant: the backing tuple has len(cls._fields).
values = tuple(argdict.pop(field, cls._unset) for field in cls._fields)
if argdict:
raise TypeError('%s unexpected field names: %r'
% (cls.__name__, argdict.keys()))
return tuple.__new__(cls, values)
def __repr__(self):
"""Return a nicely formatted representation string."""
return '%s(%s)' % (
self.__class__.__name__,
', '.join('%s=%r' % item for item in self.iteritems()))
def __reduce__(self):
"""Return self as a plain tuple of key-value pairs. Used by pickle."""
# __reduce__ provides fine-grained pickle control. The first
# argument is our class, a callable, and the second argument
# will be passed to it as *args on reconstruction.
newargs = (tuple((f, self[f]) for f in self._fields if f in self),)
return (self.__class__, newargs)
def get(self, key, default=None):
'D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.'
try:
return self[key]
except KeyError:
return default
def __contains__(self, key):
try:
self[key]
except KeyError:
return False
else:
return True
def __eq__(self, other):
if isinstance(other, _FrozenTupleMap):
return super(_FrozenTupleMap, self).__eq__(other)
elif isinstance(other, dict):
return dict(self.items()) == other
else:
return False
def __ne__(self, other):
return not (self == other)
# Support functions for making dicts with tuplemap.unpack().
def _unpacker(self):
return {}
def _unpack(self):
"""Return self as a dict."""
result = dict(self.iteritems())
result.update(self._unpacker())
return result
# The definitions from collections.Mapping require __len__,
# __iter__, and __getitem__.
def __len__(self):
return super(_FrozenTupleMap, self).__len__() - self.count(self._unset)
def __iter__(self):
for i, key in enumerate(self._fields):
val = super(_FrozenTupleMap, self).__getitem__(i)
if val is not self._unset:
yield key
def __getitem__(self, key):
if not isinstance(key, str):
raise TypeError('%s keys must be strings, not %s'
% (self.__class__.__name__,
key.__class__.__name__))
try:
# For tuples with fewer than maybe 5 items, this is
# actually faster than a hashed dict lookup!
field_index = self._fields.index(key)
except ValueError:
raise KeyError(key)
val = super(_FrozenTupleMap, self).__getitem__(field_index)
if val is self._unset:
raise KeyError(key)
return val
# We may someday want to implement __setitem__ and __delitem__ and
# use definitions from collections.MutableMapping. Maybe in a
# subclass called _TupleMap, or maybe we invert the hierarchy to
# have the frozen version be the subclass. For now, though, it's
# kind of nice to be immutable.
def __setitem__(self, key, value):
# Overridden to avoid the awkwardness of being able to set
# numeric indices that we inherit from our tuple superclass.
raise NotImplementedError()
def __delitem__(self, key):
raise NotImplementedError()
# Finally, the dict API. Taken from collections.Mapping.__dict__.
def iterkeys(self):
return iter(self)
def itervalues(self):
for i in xrange(len(self._fields)):
val = super(_FrozenTupleMap, self).__getitem__(i)
if val is not self._unset:
yield val
def iteritems(self):
for i, key in enumerate(self._fields):
val = super(_FrozenTupleMap, self).__getitem__(i)
if val is not self._unset:
yield (key, val)
def keys(self):
return list(self.iterkeys())
def items(self):
return [(key, self[key]) for key in self]
def values(self):
return list(self.itervalues())
def frozentuplemap(typename, field_names, unpacker=None):
"""Return a new immutable tuplemap with named fields.
If set, `unpacker` must be a function that accepts one argument, a
tuplemap instance, and returns a value that will be passed into
dict.update(). The function is used to augment the dict returned
by tuplemap.unpack(). For example, to further unpack contained
tuplemaps.
"""
for field in field_names:
assert isinstance(field, str)
_fields = tuple(field_names)
typedict = {
'__slots__': (),
'_fields': _fields,
'__doc__': '%s(%s)' % (typename, repr(_fields).replace("'", "")[1:-1]),
}
if unpacker:
typedict['_unpacker'] = unpacker
result = type(typename, (_FrozenTupleMap,), typedict)
# Setting __module__ is taken from collection.py in namedtuple().
# For pickling to work, the __module__ variable needs to be set to
# the frame where the named tuple is created. Bypass this step in
# environments where sys._getframe is not defined (Jython for
# example) or sys._getframe is not defined for arguments greater
# than 0 (IronPython).
try:
result.__module__ = sys._getframe(1).f_globals.get('__name__',
'__main__')
except (AttributeError, ValueError):
pass
return result
def unpack(maybe_tuplemap):
"""Return a dict if passed a tuplemap, otherwise return the argument.
If the argument is a tuplemap it may recursively unpack contained
tuplemaps. See frozentuplemap()'s `unpacker` parameter.
This is useful when a tuplemap used as a memory optimization needs
to be serialized. For example, callers of json.dumps don't have a
hook to control serialization for basic types like tuple and
should call unpack() first.
"""
if isinstance(maybe_tuplemap, _FrozenTupleMap):
return maybe_tuplemap._unpack()
return maybe_tuplemap
import pickle
import unittest
from pympler import asizeof
import tuplemap
Point = tuplemap.frozentuplemap('Point', ('x', 'y'))
class TestTupleMap(unittest.TestCase):
def _assert_x1y2_point(self, pt):
self.assertEqual(pt, {'x': 1, 'y': 2})
self.assertEqual(pt['x'], 1)
self.assertEqual(pt['y'], 2)
with self.assertRaises(KeyError):
pt['z']
self.assertIn('x', pt)
self.assertNotIn('z', pt)
self.assertEqual(pt.get('x'), 1)
self.assertEqual(pt.get('y'), 2)
self.assertIsNone(pt.get('z'))
self.assertEqual(pt.items(), [('x', 1), ('y', 2)])
self.assertEqual(pt.values(), [1, 2])
self.assertEqual(pt.keys(), ['x', 'y'])
self.assertEqual(list(pt.iteritems()), [('x', 1), ('y', 2)])
self.assertEqual(list(pt.itervalues()), [1, 2])
self.assertEqual(list(pt.iterkeys()), ['x', 'y'])
def test_interface(self):
self.assertEqual(Point(), {})
pt = Point(x=1, y=2)
self._assert_x1y2_point(pt)
# Since the superclass is a tuple, there are still some
# tuple-isms available.
self.assertEqual(pt.count(2), 1)
self.assertEqual(pt.index(1), 0)
with self.assertRaises(ValueError):
pt.index(3)
# But it behaves like a dict here.
self.assertEqual(Point(), {})
self.assertEqual(Point(x=1), {'x': 1})
self.assertNotEqual(Point(), tuple())
self.assertNotEqual(Point(x=1), (1,))
self.assertNotEqual(Point(x=1, y=2), (1, 2))
def test_pickling(self):
pt = Point(x=1, y=2)
pt_again = pickle.loads(pickle.dumps(pt, pickle.HIGHEST_PROTOCOL))
self._assert_x1y2_point(pt_again)
# Ensure that older protocols work (this errored back when
# tuplemap implemented __getnewargs__ rather than __reduce__).
pt_again = pickle.loads(pickle.dumps(pt, 0))
self._assert_x1y2_point(pt_again)
def test_constructor(self):
pt = Point(y=2, x=1)
self.assertEqual(pt['x'], 1)
self.assertEqual(pt['y'], 2)
self.assertEqual(Point([('x', 1)])['x'], 1)
self.assertEqual(Point({'x': 1})['x'], 1)
self.assertEqual(Point([('x', 1)], x=0)['x'], 0)
def test_new_definition(self):
myclass = tuplemap.frozentuplemap('myclass', ('x',))
self.assertEqual(myclass().__slots__, tuple())
self.assertEqual(myclass()._fields, tuple(['x']))
# Ensure we use __slots__ and don't incur __dict__ overhead.
with self.assertRaises(AttributeError):
myclass().__dict__
def test_eq(self):
pt1 = Point(y=2, x=1)
pt2 = Point(x=1, y=2)
pt3 = Point(x=2, y=1)
self.assertEqual(pt1, pt2)
self.assertNotEqual(pt1, pt3)
self.assertEqual(pt1, {'x': 1, 'y': 2})
self.assertNotEqual(pt3, {'x': 1, 'y': 2})
self.assertNotEqual(pt1, ('x', 'y'))
self.assertNotEqual(pt1, (1, 2))
def test_memory_size(self):
# Roughly test the core purpose of tuple map, in-memory size.
solar_system_distances = dict((('Mercury', 0.39), ('Venus', 0.723),
('Earth', 1.0), ('Mars', 1.524),
('Jupiter', 5.203), ('Saturn', 9.539),
('Uranus', 19.18), ('Neptune', 30.06),
('Pluto (now a dwarf planet)', 39.53)))
myclass = tuplemap.frozentuplemap('myclass',
tuple(solar_system_distances))
asdicts = [dict(**solar_system_distances) for _ in xrange(1000)]
astuplemaps = [myclass(**solar_system_distances) for _ in xrange(1000)]
# On chris' Macbook Pro dicts are ~1MB and tuplemaps ~130KB.
self.assertGreater(asizeof.asizeof(asdicts), 950 * 1000)
self.assertLess(asizeof.asizeof(astuplemaps), 150 * 1000)
class TestUnpack(unittest.TestCase):
def test_non_tuplemap(self):
self.assertEqual(1, tuplemap.unpack(1))
self.assertEqual([1, 2, 3], tuplemap.unpack([1, 2, 3]))
self.assertEqual({'x': 1}, tuplemap.unpack({'x': 1}))
obj = object()
self.assertEqual(id(obj), id(tuplemap.unpack(obj)))
def test_tuplemap(self):
self.assertEqual({'x': 1, 'y': 2}, tuplemap.unpack(Point(x=1, y=2)))
self.assertEqual({'x': 1}, tuplemap.unpack(Point(x=1)))
self.assertEqual({}, tuplemap.unpack(Point()))
# Test unpacking contained tuplemaps.
innerclass = tuplemap.frozentuplemap('innerclass', ('x',))
outerclass = tuplemap.frozentuplemap('outerclass', ('inner',),
unpacker=lambda self: {
'inner': [tuplemap.unpack(i) for i in self['inner']],
})
obj = outerclass(inner=[innerclass(x=1), innerclass(x=2)])
self.assertEqual({'inner': [{'x': 1}, {'x': 2}]}, tuplemap.unpack(obj))
@chrisklaiber
Copy link
Author

You can run the tests with python -m unittest namedmap_test tuplemap_test.

For context, see @wchargin's excellent article on Khan Academy's content infrastructure improvements here!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment