# Here's a probably-not-new data structure I discovered after implementing weight-balanced trees with
# amortized subtree rebuilds (e.g. http://jeffe.cs.illinois.edu/teaching/algorithms/notes/10-scapegoat-splay.pdf)
# and realizing it was silly to turn a subtree into a sorted array with an in-order traversal only as an
# aid in constructing a balanced subtree in linear time. Why not replace the subtree by the sorted array and
# use binary search when hitting that leaf array? Then you'd defer any splitting of the array until inserts and
# deletes hit that leaf array. Only in hindsight did I realize this is similar to the rope data structure for strings.
# Unlike ropes, it's a key-value search tree for arbitrary ordered keys.
#
# The main attraction of this data structure is its simplicity (on par with a standard weight-balanced
# tree) and that it coalesces subtrees into contiguous arrays, which reduces memory overhead and boosts
# the performance of in-order traversals and range queries (see the small range-query demo near the end
# of the example below).
#
# This chain tree has a worst-case search time of O(log n), and insertion and deletion take O(log n)
# amortized time. The worst-case time for insertions/deletions is O(n), similar to table doubling for
# dynamic arrays, but the spikes for rebuilds are usually much smaller than that since they are confined
# to subtrees.
#
# In these bounds the number n is the number of live items plus the number of deletions. Past deletions
# can affect the time taken by future operations because deleted items initially remain in interior
# nodes as tombstones/markers. There's a standard amortization trick to get rid of this O(log(deletions))
# tax: you just rebuild the whole tree if the fraction of tombstones gets too high; you've probably seen
# the same idea in open-addressed hash tables. The amortized cost of this global rebuild is only O(1) per
# deletion (ChainTree.charge() below implements a variant of this scheme via a per-operation account).
# But since tombstones are skipped when we rebuild a subtree, this worst-case O(log(deletions)) tax is
# usually much smaller. A lot of databases handle B-tree deletions in a similar way by letting nodes
# underflow below the B/2 limit, since they can't afford the O(n) rebuild time when n could be in the
# trillions. You can show that this incurs a similar O(log(deletions)) tax:
# http://sidsen.azurewebsites.net/papers/relaxed-b-tree-tods.pdf
#
# If you implement this in a language without garbage collection, you'll need to do your own reference
# counting for leaf arrays since they are shared among different chains. The sharing also creates a
# potential space-bloat issue: as with ropes, a single live item can keep an arbitrarily large array
# alive. (A tiny demonstration appears at the very end of this file.)
#
# This is probably the best engineering argument for amortized global rebuilds: instead of worrying
# about tombstones directly, just track the amount of wasted space and trigger a global rebuild when it
# reaches too large a fraction of the tree's total space. That gives you a worst-case space bound of
# O(n) with O(1) additional amortized cost per insertion/deletion. Waste is introduced both by deletions
# (waste += 1) and by insertions when a subtree is rebuilt (waste += subtree_size if the shared array is
# still live). We charge $1 of the global rebuild cost to each deletion. For an insertion that triggers
# a subtree rebuild, we charge $1 to each of the subtree_size item copies performed during the subtree
# rebuild. Due to weight balance, the sibling subtree is also O(subtree_size) in size. Hence we've
# charged enough to pay for the eventual global rebuild of both the rebuilt and non-rebuilt subtrees.
# (A sketch of this waste-tracking policy appears right after the ChainTree class below.)
#
# NOTE: This version doesn't store values in nodes, so interior keys are just splitters. Splits are
# around medians from sorted chains, so this is very close to the original inspiration of
# weight-balanced trees with lazy splitting. Some of the changes also make it look closer to B-trees
# (specifically B+ trees, since values only live in leaves).
from random import randrange, seed
from itertools import islice
from bisect import bisect_left

# Module-level counters so the example below can report how much work the rebuilds did.
local_rebuild_cost = 0
global_rebuild_cost = 0

# Chains of weight <= SMALL are edited in place; see Chain.insert_at for why HALF_SMALL = SMALL // 2.
SMALL = 32
HALF_SMALL = 16
class Chain:
    # A sorted run of key-value pairs stored in (possibly shared) parallel arrays,
    # restricted to the half-open index range [start, end).
    def __init__(self, keys, values, start, end):
        assert len(keys) == len(values)
        assert 0 <= start <= end <= len(keys)
        self.keys = keys
        self.values = values
        self.start = start
        self.end = end

    def weight(self):
        return self.end - self.start

    def __iter__(self):
        for i in range(self.start, self.end):
            yield self.keys[i], self.values[i]

    def emit(self, keys, values):
        keys.extend(self.keys[self.start:self.end])
        values.extend(self.values[self.start:self.end])

    def slice(self, start, end):
        # Slices share the backing arrays; only the bounds differ.
        return Chain(self.keys, self.values, start, end)

    def key_index(self, key):
        i = bisect_left(self.keys, key, self.start, self.end)
        return i, i < self.end and self.keys[i] == key

    def split(self):
        # Split around the median; the split key stays in the left half, matching the
        # key <= node.key convention in Node.search.
        i = (self.start + self.end) // 2
        return i, self.keys[i], self.slice(self.start, i + 1), self.slice(i + 1, self.end)

    def search(self, key, none):
        i, found = self.key_index(key)
        return self.values[i] if found else none

    def is_small_array(self):
        # True when this chain spans its entire backing array and that array is small.
        return self.end - self.start == len(self.keys) <= SMALL
    def insert(self, key, value):
        i, found = self.key_index(key)
        if found:
            self.values[i] = value
            return self
        # This is a fast path for small arrays or appends. There's no point in splitting when
        # it's cheaper to just do it in place. It's especially helpful since appends are common
        # and would otherwise provoke the maximal number of median splits. It also helps with
        # insert locality: if you insert at a key and split around it, and the left subtree is
        # eventually rebuilt into its own dynamic array, you can then do fast in-place appends
        # at the split location, so every split adjacent to a complete chain effectively becomes
        # a fast insert cursor. (See the sequential-append check near the end of this file.)
        if self.is_small_array() or i == len(self.keys):
            return self.insert_in_place(key, value, i)
        return self.insert_at(key, value, i)

    def insert_in_place(self, key, value, i):
        self.keys.insert(i, key)
        self.values.insert(i, value)
        self.end += 1
        return self

    def insert_at(self, key, value, i):
        # We use HALF_SMALL so that new arrays can double in size before needing splits.
        if self.weight() <= HALF_SMALL:
            return rebuild(self).insert_in_place(key, value, i - self.start)
        split, split_key, left, right = self.split()
        if i <= split:
            left = left.insert_at(key, value, i)
        else:
            right = right.insert_at(key, value, i)
        return Node(split_key, left, right)
    def delete(self, key):
        i, found = self.key_index(key)
        if not found:
            return self
        # This is a fast path for deleting at the start/end of a chain; you might have seen
        # the same idea for ropes.
        if i == self.start:
            self.start += 1
            return self
        elif i == self.end - 1:
            self.end -= 1
            return self
        return self.delete_at(key, i)

    def delete_at(self, key, i):
        if self.weight() <= SMALL:
            # Shift the tail left by one slot; only indices inside [i, end) are touched,
            # so other chains sharing the backing array are unaffected.
            self.keys[i:self.end-1] = self.keys[i+1:self.end]
            self.values[i:self.end-1] = self.values[i+1:self.end]
            self.end -= 1
            return self
        split, split_key, left, right = self.split()
        if i <= split:
            left = left.delete_at(key, i)
        else:
            right = right.delete_at(key, i)
        return Node(split_key, left, right)
def rebuild(node):
    # Flatten a subtree into a single chain over freshly allocated arrays, dropping
    # any interior splitter keys along the way.
    keys = []
    values = []
    node.emit(keys, values)
    return Chain(keys, values, 0, len(keys))
class Node:
    # An interior node; its key is only a splitter, since values live in chains (as in a B+ tree).
    def __init__(self, key, left, right):
        self.key = key
        self.left = left
        self.right = right
        self.cached_weight = 1 + left.weight() + right.weight()

    def weight(self):
        return self.cached_weight

    def __iter__(self):
        yield from self.left
        yield from self.right

    def emit(self, keys, values):
        self.left.emit(keys, values)
        self.right.emit(keys, values)

    def search(self, key, none):
        # Keys equal to the splitter live in the left subtree (see Chain.split).
        if key <= self.key:
            return self.left.search(key, none)
        else:
            return self.right.search(key, none)

    def balance(self):
        weights = self.left.weight(), self.right.weight()
        self.cached_weight = 1 + sum(weights)
        # Allow up to 1:2 weight balance before a local rebuild; the lighter side then always
        # carries at least about a third of the weight, so the height stays O(log n).
        if max(weights) <= 2 * min(weights):
            return self
        else:
            global local_rebuild_cost
            local_rebuild_cost += self.cached_weight
            return rebuild(self)

    def insert(self, key, value):
        if key <= self.key:
            self.left = self.left.insert(key, value)
        else:
            self.right = self.right.insert(key, value)
        return self.balance()

    def delete(self, key):
        if key <= self.key:
            self.left = self.left.delete(key)
        else:
            self.right = self.right.delete(key)
        return self.balance()
class ChainTree:
    def __init__(self):
        self.root = Chain([], [], 0, 0)
        self.account = 0

    def flatten(self):
        self.root = rebuild(self.root)
        self.account = 0

    def charge(self):
        self.account += 1
        cost = self.root.weight()
        # This is a simple amortization scheme for global rebuilds that's guaranteed not to
        # overcharge: a rebuild costing O(weight) only happens once at least that many
        # operations have paid $1 each into the account since the last flatten.
        if self.account >= cost:
            global global_rebuild_cost
            global_rebuild_cost += cost
            self.flatten()

    def __iter__(self):
        return iter(self.root)

    def search(self, key, none=None):
        return self.root.search(key, none)

    def insert(self, key, value):
        self.root = self.root.insert(key, value)
        self.charge()

    def delete(self, key):
        self.root = self.root.delete(key)
        self.charge()
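
# A minimal sketch (not part of the original design) of the waste-tracking rebuild policy described
# in the header comments: count one unit of waste per deletion and rebuild the whole tree once waste
# is too large a fraction of total space. Charging subtree rebuilds (waste += subtree_size when the
# shared array stays live) would need a hook in Node.balance, which this sketch omits; the one-third
# threshold is an arbitrary choice, and the override replaces the account-based trigger for deletions.
class WasteTrackingChainTree(ChainTree):
    def __init__(self):
        super().__init__()
        self.waste = 0

    def flatten(self):
        super().flatten()
        self.waste = 0

    def delete(self, key):
        self.root = self.root.delete(key)
        self.waste += 1
        # Rebuild when wasted space exceeds a third of total (live + wasted) space.
        if 3 * self.waste >= self.root.weight() + self.waste:
            self.flatten()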
# Example
seed(0)
tree = ChainTree()
n = 100000
d = {}
print("Inserting")
for i in range(n // 4):
    key = randrange(2 * n)
    # print("%s => %s" % (key, i))
    d[key] = i
    tree.insert(key, i)
for i in range(n // 4):
    key = randrange(n - 10, n + 10)
    # print("%s => %s" % (key, i))
    d[key] = i
    tree.insert(key, i)
for i in range(n // 4):
    key = randrange(2 * n)
    # print("%s => %s" % (key, i))
    d[key] = i
    tree.insert(key, i)
for i in range(n // 4):
    key = randrange(n - 10, n + 10)
    # print("%s => %s" % (key, i))
    d[key] = i
    tree.insert(key, i)
print("Local rebuild cost:", local_rebuild_cost) | |
print("Global rebuild cost:", global_rebuild_cost) | |
#tree.flatten() | |
print("Searching") | |
for key, value in d.items(): | |
assert tree.search(key) == value | |
print("Iterating") | |
for key, value in tree: | |
assert d[key] == value | |
print("Deleting") | |
for i in range(n // 2): | |
key = randrange(2 * n) | |
if key in d: | |
# print("Deleting", key) | |
del d[key] | |
tree.delete(key) | |
assert sorted(d.items()) == list(tree) | |
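
# A small range-query demo (not in the original example): iteration is in key order, so filtering
# the iterator and taking an islice yields the first few items at or above a key. A real range
# query would descend the tree to the start key instead of scanning from the smallest key, but
# this shows off the cheap sorted iteration the header comments mention.
print("First 5 surviving items with key >=", n)
for key, value in islice(((k, v) for k, v in tree if k >= n), 5):
    print(key, value)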
print("Local rebuild cost:", local_rebuild_cost) | |
print("Global rebuild cost:", global_rebuild_cost) | |
#for key, value in tree:
#    print(key, value)
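
# A tiny illustration of the space-bloat caveat from the header comments: because chains share
# their backing arrays, a chain holding a single live item still pins the entire array.
# (Constructed directly here; in the tree this situation arises via split slices.)
pinned = Chain(list(range(1000000)), [0] * 1000000, 0, 1)
assert pinned.weight() == 1 and len(pinned.keys) == 1000000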