Skip to content

Instantly share code, notes, and snippets.

@teepark
Created September 9, 2010 22:45
Show Gist options
  • Star 73 You must be signed in to star a gist
  • Fork 27 You must be signed in to fork a gist
  • Save teepark/572734 to your computer and use it in GitHub Desktop.
Save teepark/572734 to your computer and use it in GitHub Desktop.
a pure-python B tree and B+ tree implementation
import bisect
import itertools
import operator
class _BNode(object):
__slots__ = ["tree", "contents", "children"]
def __init__(self, tree, contents=None, children=None):
self.tree = tree
self.contents = contents or []
self.children = children or []
if self.children:
assert len(self.contents) + 1 == len(self.children), \
"one more child than data item required"
def __repr__(self):
name = getattr(self, "children", 0) and "Branch" or "Leaf"
return "<%s %s>" % (name, ", ".join(map(str, self.contents)))
def lateral(self, parent, parent_index, dest, dest_index):
if parent_index > dest_index:
dest.contents.append(parent.contents[dest_index])
parent.contents[dest_index] = self.contents.pop(0)
if self.children:
dest.children.append(self.children.pop(0))
else:
dest.contents.insert(0, parent.contents[parent_index])
parent.contents[parent_index] = self.contents.pop()
if self.children:
dest.children.insert(0, self.children.pop())
def shrink(self, ancestors):
parent = None
if ancestors:
parent, parent_index = ancestors.pop()
# try to lend to the left neighboring sibling
if parent_index:
left_sib = parent.children[parent_index - 1]
if len(left_sib.contents) < self.tree.order:
self.lateral(
parent, parent_index, left_sib, parent_index - 1)
return
# try the right neighbor
if parent_index + 1 < len(parent.children):
right_sib = parent.children[parent_index + 1]
if len(right_sib.contents) < self.tree.order:
self.lateral(
parent, parent_index, right_sib, parent_index + 1)
return
center = len(self.contents) // 2
sibling, push = self.split()
if not parent:
parent, parent_index = self.tree.BRANCH(
self.tree, children=[self]), 0
self.tree._root = parent
# pass the median up to the parent
parent.contents.insert(parent_index, push)
parent.children.insert(parent_index + 1, sibling)
if len(parent.contents) > parent.tree.order:
parent.shrink(ancestors)
def grow(self, ancestors):
parent, parent_index = ancestors.pop()
minimum = self.tree.order // 2
left_sib = right_sib = None
# try to borrow from the right sibling
if parent_index + 1 < len(parent.children):
right_sib = parent.children[parent_index + 1]
if len(right_sib.contents) > minimum:
right_sib.lateral(parent, parent_index + 1, self, parent_index)
return
# try to borrow from the left sibling
if parent_index:
left_sib = parent.children[parent_index - 1]
if len(left_sib.contents) > minimum:
left_sib.lateral(parent, parent_index - 1, self, parent_index)
return
# consolidate with a sibling - try left first
if left_sib:
left_sib.contents.append(parent.contents[parent_index - 1])
left_sib.contents.extend(self.contents)
if self.children:
left_sib.children.extend(self.children)
parent.contents.pop(parent_index - 1)
parent.children.pop(parent_index)
else:
self.contents.append(parent.contents[parent_index])
self.contents.extend(right_sib.contents)
if self.children:
self.children.extend(right_sib.children)
parent.contents.pop(parent_index)
parent.children.pop(parent_index + 1)
if len(parent.contents) < minimum:
if ancestors:
# parent is not the root
parent.grow(ancestors)
elif not parent.contents:
# parent is root, and its now empty
self.tree._root = left_sib or self
def split(self):
center = len(self.contents) // 2
median = self.contents[center]
sibling = type(self)(
self.tree,
self.contents[center + 1:],
self.children[center + 1:])
self.contents = self.contents[:center]
self.children = self.children[:center + 1]
return sibling, median
def insert(self, index, item, ancestors):
self.contents.insert(index, item)
if len(self.contents) > self.tree.order:
self.shrink(ancestors)
def remove(self, index, ancestors):
minimum = self.tree.order // 2
if self.children:
# try promoting from the right subtree first,
# but only if it won't have to resize
additional_ancestors = [(self, index + 1)]
descendent = self.children[index + 1]
while descendent.children:
additional_ancestors.append((descendent, 0))
descendent = descendent.children[0]
if len(descendent.contents) > minimum:
ancestors.extend(additional_ancestors)
self.contents[index] = descendent.contents[0]
descendent.remove(0, ancestors)
return
# fall back to the left child
additional_ancestors = [(self, index)]
descendent = self.children[index]
while descendent.children:
additional_ancestors.append(
(descendent, len(descendent.children) - 1))
descendent = descendent.children[-1]
ancestors.extend(additional_ancestors)
self.contents[index] = descendent.contents[-1]
descendent.remove(len(descendent.children) - 1, ancestors)
else:
self.contents.pop(index)
if len(self.contents) < minimum and ancestors:
self.grow(ancestors)
class _BPlusLeaf(_BNode):
__slots__ = ["tree", "contents", "data", "next"]
def __init__(self, tree, contents=None, data=None, next=None):
self.tree = tree
self.contents = contents or []
self.data = data or []
self.next = next
assert len(self.contents) == len(self.data), "one data per key"
def insert(self, index, key, data, ancestors):
self.contents.insert(index, key)
self.data.insert(index, data)
if len(self.contents) > self.tree.order:
self.shrink(ancestors)
def lateral(self, parent, parent_index, dest, dest_index):
if parent_index > dest_index:
dest.contents.append(self.contents.pop(0))
dest.data.append(self.data.pop(0))
parent.contents[dest_index] = self.contents[0]
else:
dest.contents.insert(0, self.contents.pop())
dest.data.insert(0, self.data.pop())
parent.contents[parent_index] = dest.contents[0]
def split(self):
center = len(self.contents) // 2
median = self.contents[center - 1]
sibling = type(self)(
self.tree,
self.contents[center:],
self.data[center:],
self.next)
self.contents = self.contents[:center]
self.data = self.data[:center]
self.next = sibling
return sibling, sibling.contents[0]
def remove(self, index, ancestors):
minimum = self.tree.order // 2
if index >= len(self.contents):
self, index = self.next, 0
key = self.contents[index]
# if any leaf that could accept the key can do so
# without any rebalancing necessary, then go that route
current = self
while current is not None and current.contents[0] == key:
if len(current.contents) > minimum:
if current.contents[0] == key:
index = 0
else:
index = bisect.bisect_left(current.contents, key)
current.contents.pop(index)
current.data.pop(index)
return
current = current.next
self.grow(ancestors)
def grow(self, ancestors):
minimum = self.tree.order // 2
parent, parent_index = ancestors.pop()
left_sib = right_sib = None
# try borrowing from a neighbor - try right first
if parent_index + 1 < len(parent.children):
right_sib = parent.children[parent_index + 1]
if len(right_sib.contents) > minimum:
right_sib.lateral(parent, parent_index + 1, self, parent_index)
return
# fallback to left
if parent_index:
left_sib = parent.children[parent_index - 1]
if len(left_sib.contents) > minimum:
left_sib.lateral(parent, parent_index - 1, self, parent_index)
return
# join with a neighbor - try left first
if left_sib:
left_sib.contents.extend(self.contents)
left_sib.data.extend(self.data)
parent.remove(parent_index - 1, ancestors)
return
# fallback to right
self.contents.extend(right_sib.contents)
self.data.extend(right_sib.data)
parent.remove(parent_index, ancestors)
class BTree(object):
BRANCH = LEAF = _BNode
def __init__(self, order):
self.order = order
self._root = self._bottom = self.LEAF(self)
def _path_to(self, item):
current = self._root
ancestry = []
while getattr(current, "children", None):
index = bisect.bisect_left(current.contents, item)
ancestry.append((current, index))
if index < len(current.contents) \
and current.contents[index] == item:
return ancestry
current = current.children[index]
index = bisect.bisect_left(current.contents, item)
ancestry.append((current, index))
present = index < len(current.contents)
present = present and current.contents[index] == item
return ancestry
def _present(self, item, ancestors):
last, index = ancestors[-1]
return index < len(last.contents) and last.contents[index] == item
def insert(self, item):
current = self._root
ancestors = self._path_to(item)
node, index = ancestors[-1]
while getattr(node, "children", None):
node = node.children[index]
index = bisect.bisect_left(node.contents, item)
ancestors.append((node, index))
node, index = ancestors.pop()
node.insert(index, item, ancestors)
def remove(self, item):
current = self._root
ancestors = self._path_to(item)
if self._present(item, ancestors):
node, index = ancestors.pop()
node.remove(index, ancestors)
else:
raise ValueError("%r not in %s" % (item, self.__class__.__name__))
def __contains__(self, item):
return self._present(item, self._path_to(item))
def __iter__(self):
def _recurse(node):
if node.children:
for child, item in zip(node.children, node.contents):
for child_item in _recurse(child):
yield child_item
yield item
for child_item in _recurse(node.children[-1]):
yield child_item
else:
for item in node.contents:
yield item
for item in _recurse(self._root):
yield item
def __repr__(self):
def recurse(node, accum, depth):
accum.append((" " * depth) + repr(node))
for node in getattr(node, "children", []):
recurse(node, accum, depth + 1)
accum = []
recurse(self._root, accum, 0)
return "\n".join(accum)
@classmethod
def bulkload(cls, items, order):
tree = object.__new__(cls)
tree.order = order
leaves = tree._build_bulkloaded_leaves(items)
tree._build_bulkloaded_branches(leaves)
return tree
def _build_bulkloaded_leaves(self, items):
minimum = self.order // 2
leaves, seps = [[]], []
for item in items:
if len(leaves[-1]) < self.order:
leaves[-1].append(item)
else:
seps.append(item)
leaves.append([])
if len(leaves[-1]) < minimum and seps:
last_two = leaves[-2] + [seps.pop()] + leaves[-1]
leaves[-2] = last_two[:minimum]
leaves[-1] = last_two[minimum + 1:]
seps.append(last_two[minimum])
return [self.LEAF(self, contents=node) for node in leaves], seps
def _build_bulkloaded_branches(self, (leaves, seps)):
minimum = self.order // 2
levels = [leaves]
while len(seps) > self.order + 1:
items, nodes, seps = seps, [[]], []
for item in items:
if len(nodes[-1]) < self.order:
nodes[-1].append(item)
else:
seps.append(item)
nodes.append([])
if len(nodes[-1]) < minimum and seps:
last_two = nodes[-2] + [seps.pop()] + nodes[-1]
nodes[-2] = last_two[:minimum]
nodes[-1] = last_two[minimum + 1:]
seps.append(last_two[minimum])
offset = 0
for i, node in enumerate(nodes):
children = levels[-1][offset:offset + len(node) + 1]
nodes[i] = self.BRANCH(self, contents=node, children=children)
offset += len(node) + 1
levels.append(nodes)
self._root = self.BRANCH(self, contents=seps, children=levels[-1])
class BPlusTree(BTree):
LEAF = _BPlusLeaf
def _get(self, key):
node, index = self._path_to(key)[-1]
if index == len(node.contents):
if node.next:
node, index = node.next, 0
else:
return
while node.contents[index] == key:
yield node.data[index]
index += 1
if index == len(node.contents):
if node.next:
node, index = node.next, 0
else:
return
def _path_to(self, item):
path = super(BPlusTree, self)._path_to(item)
node, index = path[-1]
while hasattr(node, "children"):
node = node.children[index]
index = bisect.bisect_left(node.contents, item)
path.append((node, index))
return path
def get(self, key, default=None):
try:
return self._get(key).next()
except StopIteration:
return default
def getlist(self, key):
return list(self._get(key))
def insert(self, key, data):
path = self._path_to(key)
node, index = path.pop()
node.insert(index, key, data, path)
def remove(self, key):
path = self._path_to(key)
node, index = path.pop()
node.remove(index, path)
__getitem__ = get
__setitem__ = insert
__delitem__ = remove
def __contains__(self, key):
for item in self._get(key):
return True
return False
def iteritems(self):
node = self._root
while hasattr(node, "children"):
node = node.children[0]
while node:
for pair in itertools.izip(node.contents, node.data):
yield pair
node = node.next
def iterkeys(self):
return itertools.imap(operator.itemgetter(0), self.iteritems())
def itervalues(self):
return itertools.imap(operator.itemgetter(1), self.iteritems())
__iter__ = iterkeys
def items(self):
return list(self.iteritems())
def keys(self):
return list(self.iterkeys())
def values(self):
return list(self.itervalues())
def _build_bulkloaded_leaves(self, items):
minimum = self.order // 2
leaves, seps = [[]], []
for item in items:
if len(leaves[-1]) >= self.order:
seps.append(item)
leaves.append([])
leaves[-1].append(item)
if len(leaves[-1]) < minimum and seps:
last_two = leaves[-2] + leaves[-1]
leaves[-2] = last_two[:minimum]
leaves[-1] = last_two[minimum:]
seps.append(last_two[minimum])
leaves = [self.LEAF(
self,
contents=[p[0] for p in pairs],
data=[p[1] for p in pairs])
for pairs in leaves]
for i in xrange(len(leaves) - 1):
leaves[i].next = leaves[i + 1]
return leaves, [s[0] for s in seps]
import random
import unittest
class BTreeTests(unittest.TestCase):
def test_additions(self):
bt = BTree(20)
l = range(2000)
for i, item in enumerate(l):
bt.insert(item)
self.assertEqual(list(bt), l[:i + 1])
def test_bulkloads(self):
bt = BTree.bulkload(range(2000), 20)
self.assertEqual(list(bt), range(2000))
def test_removals(self):
bt = BTree(20)
l = range(2000)
map(bt.insert, l)
rand = l[:]
random.shuffle(rand)
while l:
self.assertEqual(list(bt), l)
rem = rand.pop()
l.remove(rem)
bt.remove(rem)
self.assertEqual(list(bt), l)
def test_insert_regression(self):
bt = BTree.bulkload(range(2000), 50)
for i in xrange(100000):
bt.insert(random.randrange(2000))
class BPlusTreeTests(unittest.TestCase):
def test_additions_sorted(self):
bt = BPlusTree(20)
l = range(2000)
for item in l:
bt.insert(item, str(item))
for item in l:
self.assertEqual(str(item), bt[item])
self.assertEqual(l, list(bt))
def test_additions_random(self):
bt = BPlusTree(20)
l = range(2000)
random.shuffle(l)
for item in l:
bt.insert(item, str(item))
for item in l:
self.assertEqual(str(item), bt[item])
self.assertEqual(range(2000), list(bt))
def test_bulkload(self):
bt = BPlusTree.bulkload(zip(range(2000), map(str, range(2000))), 20)
self.assertEqual(list(bt), range(2000))
self.assertEqual(
list(bt.iteritems()),
zip(range(2000), map(str, range(2000))))
if __name__ == '__main__':
unittest.main()
@majegozo
Copy link

majegozo commented Jul 3, 2013

Hi guys,

when i use the BPlusTree, and i try to delete one element, I find an error. For example,

in

def test_additions_sorted(self):
bt = BPlusTree(20)
l = range(2000)

    for item in l:
        bt.insert(item, str(item))

    for item in l:
        self.assertEqual(str(item), bt[item])

    for item in l:
        bt.remove(item)

    self.assertEqual(l, list(bt))

I have added the list removal and executing i get:

ERROR: test_additions_sorted (main.BPlusTreeTests)

Traceback (most recent call last):
File "btree.py", line 557, in test_additions_sorted
bt.remove(item)
File "btree.py", line 443, in remove
node.remove(index, path)
File "btree.py", line 222, in remove
self.grow(ancestors)
File "btree.py", line 253, in grow
parent.remove(parent_index, ancestors)
File "btree.py", line 136, in remove
while descendent.children:
AttributeError: children

What can be happening?

Thanks in advance.

Copy link

ghost commented Aug 2, 2013

The problem seems to be that internally, _BNode.remove() is being called instead of _BPlusLeaf.remove().

@slashsorin
Copy link

Hi,
I see that the "next" is implemented at the linked list leaves-level of the tree. What about the "previous"? How could that be implemented with a double linked list?

@codecakes
Copy link

@teepark was thinking of merging some parts of the module to my work here which im working on here:
https://github.com/codecakes/BST ; still needs debugging but feel free to try out and maybe I would merge the best features from here and mine for a cleaner design.

@urban-1
Copy link

urban-1 commented Feb 26, 2017

Hi all,

Anyone else having problems when removing from a tree with order > 1 and 1 item? Any one knows how to fix this?

def test_delete(self):
    """
    This breaks remove function...
    """
    bt = BPlusTree(2)
    bt.insert(1000, "Test")
    bt.remove(1000)
    print(bt)

The problem seems to be at line 224 (grow function) which I am not sure if it should be called in this case...

UPDATE:

Changed line 212 to:

        if len(current.contents) > minimum or len(ancestors) == 0:

and seems working...

@SimonCqk
Copy link

SimonCqk commented Feb 2, 2018

hey guys, I found a bug, from line 148 to line 154:

while descendent.children:
                additional_ancestors.append(
                        (descendent, len(descendent.children) - 1))
                descendent = descendent.children[-1]
ancestors.extend(additional_ancestors)
self.contents[index] = descendent.contents[-1]
descendent.remove(len(descendent.children) - 1, ancestors)

when while break, descendent must be a BPlusLeaf, whitch has no attribute children.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment