joseanpg/BB.sml

## BB.sml
(*

http://groups.csail.mit.edu/mac/users/adams/BB/BB.sml

    Copyright 1992-1996 Stephen Adams.

    This software may be used freely provided that:
      1. This copyright notice is attached to any copy, derived work,
         or work including all or part of this software.
      2. Any derived work must contain a prominent notice stating that
         it has been altered from the original.

*)

(* Address:  Electronics & Computer Science
             University of Southampton
         Southampton  SO9 5NH
	     Great Britian
   E-mail:   sra@ecs.soton.ac.uk

   Comments:

     1.  The implementation is based on Binary search trees of Bounded
         Balance, similar to Nievergelt & Reingold, SIAM J. Computing
         2(1), March 1973.  The main advantage of these trees is that
         they keep the size of the tree in the node, giving a constant
         time size operation.

     2.  The bounded balance criterion is simpler than N&R's alpha.
         Simply, one subtree must not have more than `weight' times as
         many elements as the opposite subtree.  Rebalancing is
         guaranteed to reinstate the criterion for weight>2.23, but
         the occasional incorrect behaviour for weight=2 is not
         detrimental to performance.

     3.  There are two implementations of union.  The default,
         hedge_union, is much more complex and usually 20% faster.  I
         am not sure that the performance increase warrants the
         complexity (and time it took to write), but I am leaving it
         in for the competition.  It is derived from the original
         union by replacing the split_lt(gt) operations with a lazy
         version. The `obvious' version is called old_union.
*)

structure B (*: INTSET*) =
    struct

	local

	    type T = int
	    val lt : T*T->bool = op <

	    (* weight is a parameter to the rebalancing process. *)
	    val weight:int = 3

	    datatype  Set = E | T of T * int * Set * Set

	    fun size E = 0
	      | size (T(_,n,_,_)) = n

	    (*fun N(v,l,r) = T(v,1+size(l)+size(r),l,r)*)
	    fun N(v,E,              E)               = T(v,1,E,E)
	      | N(v,E,              r as T(_,n,_,_)) = T(v,n+1,E,r)
	      | N(v,l as T(_,n,_,_),E)               = T(v,n+1,l,E)
	      | N(v,l as T(_,n,_,_),r as T(_,m,_,_)) = T(v,n+m+1,l,r)

	    fun single_L (a,x,T(b,_,y,z)) = N(b,N(a,x,y),z)
	      | single_L _ = raise Match
	    fun single_R (b,T(a,_,x,y),z) = N(a,x,N(b,y,z))
	      | single_R _ = raise Match
	    fun double_L (a,w,T(c,_,T(b,_,x,y),z)) = N(b,N(a,w,x),N(c,y,z))
	      | double_L _ = raise Match
	    fun double_R (c,T(a,_,w,T(b,_,x,y)),z) = N(b,N(a,w,x),N(c,y,z))
	      | double_R _ = raise Match

	    fun T' (v,E,E) = T(v,1,E,E)
	      | T' (v,E,r as T(_,_,E,E))     = T(v,2,E,r)
	      | T' (v,l as T(_,_,E,E),E)     = T(v,2,l,E)

	      | T' (p as (_,E,T(_,_,T(_,_,_,_),E))) = double_L p
	      | T' (p as (_,T(_,_,E,T(_,_,_,_)),E)) = double_R p

	      (* these cases almost never happen with small weight*)
	      | T' (p as (_,E,T(_,_,T(_,ln,_,_),T(_,rn,_,_)))) =
		if ln<rn then single_L p else double_L p
	      | T' (p as (_,T(_,_,T(_,ln,_,_),T(_,rn,_,_)),E)) =
		if ln>rn then single_R p else double_R p

	      | T' (p as (_,E,T(_,_,E,_)))  = single_L p
	      | T' (p as (_,T(_,_,_,E),E))  = single_R p

	      | T' (p as (v,l as T(lv,ln,ll,lr),r as T(rv,rn,rl,rr))) =
		if rn>=weight*ln then (*right is too big*)
		    let val rln = size rl
			val rrn = size rr
		    in
			if rln < rrn then  single_L p  else  double_L p
		    end

		else if ln>=weight*rn then  (*left is too big*)
		    let val lln = size ll
			val lrn = size lr
		    in
			if lrn < lln then  single_R p  else  double_R p
		    end

		else
	             T(v,ln+rn+1,l,r)

	    fun add (E,x) = T(x,1,E,E)
	      | add (set as T(v,_,l,r),x) =
	        if lt(x,v) then T'(v,add(l,x),r)
		else if lt(v,x) then T'(v,l,add(r,x))
		     else set

	    fun concat3 (E,v,r) = add(r,v)
	      | concat3 (l,v,E) = add(l,v)
	      | concat3 (l as T(v1,n1,l1,r1), v, r as T(v2,n2,l2,r2)) =
		if weight*n1 < n2 then T'(v2,concat3(l,v,l2),r2)
		else if weight*n2 < n1 then T'(v1,l1,concat3(r1,v,r))
		     else N(v,l,r)

	    fun split_lt (E,x) = E
	      | split_lt (t as T(v,_,l,r),x) =
		if lt(x,v) then split_lt(l,x)
		else if lt(v,x) then concat3(l,v,split_lt(r,x))
		     else l

	    fun split_gt (E,x) = E
	      | split_gt (t as T(v,_,l,r),x) =
		if lt(v,x) then split_gt(r,x)
		else if lt(x,v) then concat3(split_gt(l,x),v,r)
		     else r

	    fun min (T(v,_,E,_)) = v
	      | min (T(v,_,l,_)) = min l
	      | min _            = raise Match

	    and delete' (E,r) = r
	      | delete' (l,E) = l
	      | delete' (l,r) = let val min_elt = min r in
                          		T'(min_elt,l,delmin r)
				end
	    and delmin (T(_,_,E,r)) = r
	      | delmin (T(v,_,l,r)) = T'(v,delmin l,r)
	      | delmin _ = raise Match

	    fun concat (E,  s2) = s2
	      | concat (s1, E)  = s1
	      | concat (t1 as T(v1,n1,l1,r1), t2 as T(v2,n2,l2,r2)) =
		if weight*n1 < n2 then T'(v2,concat(t1,l2),r2)
		else if weight*n2 < n1 then T'(v1,l1,concat(r1,t2))
		     else T'(min t2,t1, delmin t2)

	    fun fold(f,base,set) =
		let fun fold'(base,E) = base
		      | fold'(base,T(v,_,l,r)) = fold'(f(v,fold'(base,r)),l)
		in
		    fold'(base,set)
		end

	in

	    val empty = E

	    fun singleton x = T(x,1,E,E)


	    local
		fun trim (lo,hi,E) = E
		  | trim (lo,hi,s as T(v,_,l,r)) =
		    if  lt(lo,v)  then
			if  lt(v,hi)  then  s
			else  trim(lo,hi,l)
		    else trim(lo,hi,r)


		fun uni_bd (s,E,lo,hi) = s
		  | uni_bd (E,T(v,_,l,r),lo,hi) =
		     concat3(split_gt(l,lo),v,split_lt(r,hi))
		  | uni_bd (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),lo,hi) =
			concat3(uni_bd(l1,trim(lo,v,s2),lo,v),
				v,
				uni_bd(r1,trim(v,hi,s2),v,hi))
	          (* inv:  lo < v < hi *)

               (*all the other versions of uni and trim are
               specializations of the above two functions with
               lo=-infinity and/or hi=+infinity *)

		fun trim_lo (_ ,E) = E
		  | trim_lo (lo,s as T(v,_,_,r)) =
		        if lt(lo,v) then s else trim_lo(lo,r)
		fun trim_hi (_ ,E) = E
		  | trim_hi (hi,s as T(v,_,l,_)) =
		        if lt(v,hi) then s else trim_hi(hi,l)

		fun uni_hi (s,E,hi) = s
		  | uni_hi (E,T(v,_,l,r),hi) =
		     concat3(l,v,split_lt(r,hi))
		  | uni_hi (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),hi) =
			concat3(uni_hi(l1,trim_hi(v,s2),v),
				v,
				uni_bd(r1,trim(v,hi,s2),v,hi))

		fun uni_lo (s,E,lo) = s
		  | uni_lo (E,T(v,_,l,r),lo) =
		     concat3(split_gt(l,lo),v,r)
		  | uni_lo (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),lo) =
			concat3(uni_bd(l1,trim(lo,v,s2),lo,v),
				v,
				uni_lo(r1,trim_lo(v,s2),v))

		fun uni (s,E) = s
		  | uni (E,s as T(v,_,l,r)) = s
		  | uni (T(v,_,l1,r1), s2 as T(v2,_,l2,r2)) =
			concat3(uni_hi(l1,trim_hi(v,s2),v),
				v,
				uni_lo(r1,trim_lo(v,s2),v))

	    in
		val hedge_union = uni
	    end


	    fun old_union (E,s2)  = s2
	      | old_union (s1,E)  = s1
	      | old_union (s1 as T(v,_,l,r),s2) =
		let val l2 = split_lt(s2,v)
		    val r2 = split_gt(s2,v)
		in
		    concat3(old_union(l,l2),v,old_union(r,r2))
		end

            (* The old_union version is about 20% slower than
               hedge_union in most cases *)

	    val union = hedge_union
	    (*val union = old_union*)

	    val add = add

	    fun difference (E,s)  = E
	      | difference (s,E)  = s
	      | difference (s, T(v,_,l,r)) =
		let val l2 = split_lt(s,v)
		    val r2 = split_gt(s,v)
		in
		    concat(difference(l2,l),difference(r2,r))
		end

	    fun member (x,set) =
		let fun mem E = false
		      | mem (T(v,_,l,r)) =
			if lt(x,v) then mem l else if lt(v,x) then mem r else true
		in mem set end

	    (*fun intersection (a,b) = difference(a,difference(a,b))*)

	    fun intersection (E,_) = E
	      | intersection (_,E) = E
	      | intersection (s, T(v,_,l,r)) =
		let val l2 = split_lt(s,v)
		    val r2 = split_gt(s,v)
		in
		    if member(v,s) then
			concat3(intersection(l2,l),v,intersection(r2,r))
		    else
			concat(intersection(l2,l),intersection(r2,r))
		end

	    fun members set = fold(op::,[],set)

	    fun cardinality E = 0
	      | cardinality (T(_,n,_,_)) = n

	    fun delete (E,x) = E
	      | delete (set as T(v,_,l,r),x) =
		if lt(x,v) then T'(v,delete(l,x),r)
		else if lt(v,x) then T'(v,l,delete(r,x))
		     else delete'(l,r)

	    fun fromList l = List.fold (fn(x,y)=>add(y,x)) l E

	    type  intset = Set

	end
    end

structure IntSet : INTSET =B;

## set.hs
-- http://groups.csail.mit.edu/mac/users/adams/BB/BB.sml
-- http://groups.csail.mit.edu/mac/users/adams/BB/
-- http://t.co/f7hvBEYH3k

{- Implementing Sets Efficiently in a Functional Language
Stephen Adams
CSTR 92-10


1. Introduction
I make a distinction between data types and data structures.
The data type, often referred to as the abstract data type,
is a set of values and the operations provided on those values.

In this paper I will be using a set of integers as the data type,
with operations like union, intersection and testing
for membership.

The data structure is an implementation of the data type
using some underlying data types, perhaps built-in types.

There can be many implementations of the abstract data type, and each
different data structure used for the implementation will have different
properties. These properties are reflected in the efficiency, in both time and
space, of the abstract data type. Some data structures trade the efficiency
of one operation against another.

I have chosen balanced trees for the implementation because they provides a good
all-round performance, with no operation being particularly inefficient.
Particular care has been taken to make the algorithms efficient, while trying
to keep them clear. Program transformation techniques are used to derive
a more efficient union operation by removing redundant operations.

A binary search tree is either empty or is a node with left and right subtrees
and a data item of type Element. In addition we keep in each node a count
of all the nodes in the tree rooted at that node. We declare such a structure
in Haskell like this:
-}
data Tree a = E                   -- Ord a =>
            | T { datum :: a
                , count :: Int
                , leftsubtree  :: Tree a
                , rightsubtree :: Tree a
                }

{-
  Example:
   T "Rachel" 4 (T "Andrew" 2 E
                             (T "Judith" 1 E E))
                (T "Stephen" 1 E E)

The elements in the tree are ordered so that all of the elements stored in the
left subtree of a node are less than the element stored in the node and all
of the elements in the right subtree are greater. Duplicate elements are not
allowed

If duplicates are required then the tree can be used to do this by storing a count of the
duplicates at the node as well as the value.
-}

size :: Ord a => Tree a -> Int
size E = 0
size ( T _ count _ _ ) = count

{-
Note that the following invariant holds for the size of a tree.
For a node
    n = T _ count left right
    size n = count = 1 + size left + size right

We can ensure that this is always the case by using a smart constructor
N in the place of T which calculates the size for the new node:

I use the term smart constructor because the programmer would normally use N in place
of the native constructor T. The distinction between a smart constructor and an ordinary
function, which constructs a solution to a problem, is level of abstraction.

-}

N value left right  = T value  (1 + size l + size r)  left  right

{-
The member function tests whether an element x is in the tree.
The ordering of the elements is used to direct the search.
From the element stored in the current node we can tell if x is
in the left or right subtree.
-}
member :: Ord a => Tree a -> Bool
member x E = false
member x (T value _ left right)
   | x < value = member x left
   | x > value = member x right
   | otherwise true

{-
The minimum value in a binary search tree is
found by following the left subtrees until we reach a node that has an empty
left subtree. The value at that node must be the minimum because all the
values in the right subtree are greater than that value.
-}

min :: Ord a => Tree a -> a
min (T value  _ E _ )   = value
min (T value _ left _ ) = min left
min E = raise Match 2

{-
The last case signals an error if min is applie to an empty tree.
If the case had been omitted then the SML system would have inserted it and given
us a warning that min does not match all trees.

Writing the case explicitly
avoids the warning and, more importantly, signals to someone reading the
program that min is not intended to work for empty trees. 5BoundedBalanceTres

A binary search tree is f-balanced if

1. Its subtrees are -balanced
2. The left and right subtrees satisfy some balancing constraint f.

If a binary search tree is not balanced then it may become degenerate, long
and thin, resembling a listmore than a tree. If, for example, nearly every left
subtree is empty, searches (like the member function) might have to inspect
nearly all of the elements. If the tree is short and bushy then choosing
to go left or right avoids inspecting the large number of elements in the
other branch. Balancing a tree keeps it relatively short and bushy. Trees
are usually balanced to guarantee that a search takes O(logn)operations,
where nis the size of the tree.

Maintaining a balanced tree has two costs:

1. The tree has to be rebalanced after or during operations that add or
remove elements from the tree.
2. If rebalancing is to be efficient it must be done using information local
to a node. This requires that some information is stored at each node.
In a high level language this additional information requires an extra storage
location per node. It makes sense to store additional information that is
useful inimplementing other operations. Bounded Balance treesuse the size
information to keep the tree balanced. The idea is to ensure that one subtree
does not get substantially bigger than the other. The balancing constraint
used in this paper is that one subtree is never more than a constant factor w
larger thanthe other subtree. Other balancing schemes, like height-balanced
trees, AVL trees [2] and Red-Black
3
trees store balancing information that is
otherwise not very useful. 3
For AVL trees and Red-Black trees the balancing information requires only a couple of
bits of storage. It is often suggested that storage may be saved by hiding these bits in the
pointer values used to represent the left and right subtrees. This kind of storage allocation
requires knowledge of the representation of pointers in the machine so it is inherently non-
portable. In high level languages like SML the pointers are hidden completely from the
programmer so this technique is not available.
6 CSTR 92-10
BALANCING MAINTENANCE ALGORITHMS 6.
Nievergelt and Reingold [4] use a slightly different criterion that compares
the size of a subtree to the size of the whole tree. For their purposes,
which included producing analytical results on the behaviour of the tree,
this criterion is cleaner. For our purposes it is simpler to compare the sizes
of the left and right trees.
We abstract away from the balancing scheme we use. To do this it is
necessary to be able to test if two subtrees would make a balanced tree
by looking at the subtrees alone. This is possible if there is a cheap absolute
measure of the tree that can be used to compare the trees, but not possible if
this information is relative to some other part or point in the computation.
AVL trees are an example of this because each node keeps the the height
difference between the two subtrees. Taken in isolation it is not possible to
tell immediately if the two subtrees would make a balanced node. 6Balancingmaintenancealgorithms
We will always be considering balance maintenance with the goal of pre-
venting an unbalanced tree from being created. Smart constructors are
used which build a balanced tree. There is a hierarchy of constructors
which construct balanced trees from progressively out-of-balance subtrees.
1. T(v,n,l,r)
This is the data type constructor.
2. N(v,l,r)
This is the smart constructor that ensures that the size of the tree is
maintained correctly. The tree (v,l,r) must already be balanced.
3. T’(v,l,r)
T’ is used when the original tree was in balance and one of l or r has
changed in size by at most one element, as in insertion or deletion of
a single element.
4. concat3(v,l,r)
concat3 can join trees of arbitrary sizes. As with the other construc-
tors, v must be greater than every element in l and less than every
element in r.
The T’ constructor works by comparing the sizes of the left and right
subtrees. If the sizes of these trees are sufficiently close then a tree is
constructed using the N constructor.
If one subtree is too heavy, (has a size greater than wtimes the other) then
the tree that is built is a rotation of the tree that would be built by N. The
idea of a rotation is to shift part of the heavy subtree over to the side of
the lighter subtree. There are two kinds of rotation: single rotations and
Fig. 1: Single and Double Left-Rotations. a<bc are tree elements; X, Y
and Z are trees of size nx;nyand nzrespectively.
double rotations. These are illustrated in figure 1. Each rotation has two
symmetrical variants. The rotations are expressed very succinctly in SML:
-}

single_L a x           (T b _ y z)  = N b (N a x y)  z
single_R b (T a _ x y) z            = N a x         (N b y z)

double_L a x                       (T c _ (T b _ y1 y2) z) = N b (N a x y1) (N c y2 z)
double_R c (T a _ x (T b _ y1 y2)) z                       = N b (N a x y1) (N c y2 z)

{-
The difference between a single rotation and a double rotation is that a single
(left) rotation moves the entire left subtree of the right side over to the left
side whereas the double rotation only moves the left part of the left subtree.
-}

T’ value left right
   | ln + rn < 2      = N value left righ
   | rn > weight * ln = rightIsTooBig
   | ln > weight *rn  = leftIsTooBig
   | otherwise N p
   where ln = size l
         rn = size r
         rightIsTooBig | rln < rrn = single_L value left right
                       | otherwise = double_L value left right
                       where (T _ _ rl rr) = right
                             rln = size rl
                             rrn = size rr
         leftIsTooBig  | lrn < lln = single_R value left right
                       | otherwise = double_R value left right
                       where (T _ _ ll lr) = left
                             lln = size ll
                             lrn = size lr

{-
7. INSERTION AND DELETION
The difficult part of implementing T’ is deciding on what rotations, if any,
to use. A careful analysis of the possible arguments leads directly to the
solution given in figure 2.

1. In the trivial case where the left and right trees are empty the tree will
be balanced, so T’ should call N.

2. If one subtree is empty and the other contains only one element the
tree cannot be balanced, so N is called.

Both of these last cases can be tested in one test: ln+rn < 2. As N
takes the same type of parameter as T’ I have used a layered pattern,
‘p as (v,l,r)’ at the top of T’. SML functions take exactly one
argument. Often this is a tuple of values, like (v,l,r). Layered
patterns allow the whole pattern to be named as well as its subparts.
In T’, p is bound to the tuple of parameters
4
. N can be called directly
with the same parameters like this: N p. The same idea is used later
with the rotation functions.
3. If neither tree is too heavy then N is called. This case is determined in
figure 2 by falling through to the final else clause.
4. If the right tree is too heavy (has more than wtimes the number of
elements than the left tree) then a balanced tree must be constructed
by moving part of the right tree to the left side. If the outer subtree
on the right side is smaller that its sibling then a single rotationmight
leave the tree unbalanced. Sowe do a double rotation if the inner right
subtree is the larger and a single otherwise.
For some values of wis is not possible to build a balanced tree, for
example w=1implies that the tree is always perfectly balanced,
which is impossible.
5
The choice of a value for wis discussed in
Appendix A. Here it is sufficient to say that >3:75ensures that it
is possible to create a balanced tree. It is convenient to use integral
values for w, so is chosen such that w4.
5. The case for a heavy left tree is symmetrical to the right tree.
We will return to discuss concat3 in Section 9. 7Insertionanddeletion
The conventional terms ‘insertion’ and ‘deletion’ are a bit of a misnomer
in functional programming, since referential transparency prevents us from
altering the tree. The functional analogue to the imperative concept of 4
That is why I called it p 5
Execpt, of course, for trees containing exactly 2n􀀀1elements for some n.
10 CSTR 92-10
INSERTION AND DELETION 7.
insertion is to return a new data structure that looks like old one except that
it also contains the ‘inserted’ element.
Insertion works by finding a place on the fringe of the tree to put the new
element. The place is determined by the ordering of the elements. The
new element must be placed between the highest value less than it and the
lowest value greater than it. This position is found by searching for the new
element, then a new tree is constructed which has the new element in this
place.
This is the implementationof add. The functionapplicationadd(aTree,x)
returns a new tree which contains x:
-}

add E x = (T x 1 E E)
add tree@(T v _ l r) x | x < v = T’ v (add l x) r
                       | v < x = T’ v l (add r x)
                       | otherwise = tree
{-
The first case handles an empty tree and returns a tree containing one
element. This case is used as a base case for the recursion in the second case.
The second case navigates through the tree much like member, deciding to
add the new element to either the left subtree or the right subtree. If, in the
final analysis, the tree already has the new element at the root then there
is no point in inserting it, to the original tree which already contains x is
returned. It is more useful to change the last line to:
else N(x,l,r)
This is more useful for when the tree contains elements which have infor-
mation that is not used in the lt comparison, for example database records
that are indexed on only one field. With this change, adding an element that
already occurs in the tree but has different associated data has the effect of
‘updating’ the element’s associated data. However, it does cost a little extra
in storage and time to build the new node using N rather than reusing the
old one.
It is interesting to note that the algorithm is the same as for inserting in
an unbalanced tree, except we use the rebalancing constructor T’ is used
rather than the plain one N.
Deletion, or rather, computing a tree which looks like the old one but has
a selected element missing, is a little harder. The basic idea is to navigate
down the tree to find the element to delete. If the tree is empty then the job
is trivial—there is nothing to delete. If the element is to the left or right then
it is deleted from that subtree and the tree is rebuilt using the rebalancing
constructor. The difficulty comes when the element is at the root of the
current subtree, so that case is left to an auxiliary function, delete’:
-}

delete E x = E
delete (T v _ l r) x | x < v = T’ v (delete l x) r
                     | v < x = T’ v l (delete r x)
                     | otherwise delete’ l r
{--
The auxiliary function delete’ must build a tree containing all the el-
ements of l and r. This could be done easily if there was an element
of the right value available to use because we could then join l and r
using N. This element is found by taking the minimum element from r.
Note that the rebalancing constructor T’ is used rather than N because
delete(r,min_elt) is one element smaller than r, and so might not
balance with l. There are also two special cases for when either l or r is
empty. These cases can simply return the other tree.
-}

delete’  E r = r
delete’  l E = l
delete’  l r  = let min_elt = min r
                in T’ min_elt l (delete r min_elt)

{-
The efficiency of this code can be improved. We know that min_elt is the
minimum element in r, so min_elt must be in r, so delete(r,min_elt) will never use the
first case of delete. min_elt must be at the leftmost node in the tree r.
It is considerably more efficient to use a specialized function since no
comparison is necessary. The improved algorithm is given below, with
another auxiliary function which deletes the minimum element in a tree.
This function, delmin, is structured like min and is useful in its own right,
for example, in implementing priority queues. Since min_elt is only used
in one place we dispense with the let.
-}
delete’ (E,r) = r
delete’ (l,E) = l
delete’ (l,r) = T’(min r,l,delmin r)

delmin (T(_,_,E,r)) = r
delmin (T(v,_,l,r)) = T’(v,delmin l,r)
delmin _ = raise Match
{-
Again it is worth noting that this deletion algorithm is the same as for
unbalanced trees except that T’ is used to ensure balance.

PROCESSING A TREE 8. 8Procesingatre
Insertion, deletion and membership tests (including lookup) allow trees to
be used in a variety of applications, for example as a database. Sometimes
it is useful to process all the elements in a tree, for example, to print them
or sum them.
There are many ways of processing the elements in a tree. They may be
processed from left to right or from right to left. The element at a node
may be processed before, in-between or after processing the element in the
subtrees—giving pre-order, in-order and post-order traversals respectively.
It is a good idea to capture all of this detail in a function. The SML
standard environment and provides a function called fold for combining
the elements of a list to build a result.
6
We copy this idea and present
inorder_fold, which combines the elements from right-to-left and in-
order:
fun inorder_fold(f,base,tree) =
let fun fold’(base,E) = base
| fold’(base,T(v,_,l,r)) = fold’(f(v,fold’(base,r)),l)
in
fold’(base,tree)
end
Other traversals can be built by changing the order of the calls to f and
fold’. This traversal is in-order because the call to f is between the two
calls to fold’, and it is right to left because r is innermost, being processed
before v and then l.
One simple use of inorder_fold is to make a list of all the elements in the
tree.
fun members tree = inorder_fold(op::, [], tree)
This functionworks by startingwith anempty list ([]) andworking through
the tree adding the elements on to the front of the listwith the list constructor
operator ::. As inorder_fold performs a right-to-left in-order traversal
the elements in the list are in order according to the ordering relation <. A
tree-sort can be constructed by converting the list into a tree and back again:
fun reverse_add (element,tree) = add(tree,element)
fun tree_from_list aList = fold reverse_add aList E
fun treesort aList = members (tree_from_list aList) 6
The list fold function in NJ-SML’s standard environment has the type
val fold : (’a * ’b -> ’b) -> ’a list -> ’b -> ’b
CSTR 92-10 13
9. SETSET!SET OPERATIONS
The sort takes time O(nlogn)which is, within the constant factor, as fast
as is possible using only an ordering relation [1]. Note that reverse_add
is necessary because the arguments for add are in the wrong order for the
argument f of inorder_fold.
A interesting decision in the design of inorder_fold is to make it have
the type
val inorder_fold : (Element * ’a -> ’a) * ’a * Tree -> ’a
Usually a fold-like function is written to be isomorphic to the structure
which is is applied to:
val treefold : (Element * ’a * ’a -> ’a) * ’a * Tree -> ’a
fun treefold (f,base,E) = base
| treefold (f,base,T(v,_,l,r)) =
f(v, treefold(f,base,l), treefold(f,base,r))
This function can be thought of as substituting base for every empty tree
and f for every node. f takes an element and the results of the computations
from both subtrees. This is analogous to the list fold which substitutes a
function for every cons cell and a value for the single empty list [] at the
end.
If fold-like functions are isomorphic to the data structure then each fold-like
function takes parameterswith different types. When the data type is being
used as a set or table this becomes poor for abstraction as the user of the
data type has to know how it is implemented, e.g. as a list or a binary tree
or some other structure
7
. As all the programmer wants is to collect all the
elements together in a computation it makes sense to make several fold
functions like inorder_fold which all take the same kinds of arguments. 9SetSet!Setoperations
In this section we develop efficient functions for combining two trees. The
operations are union, intersection and difference.
The union operation (A[B) is discussed in most detail as an exemplar of
the principles involved. Asymmetric set difference (A􀀀B) is described as
its implementation has some important differences to union. Intersection is
trivial since it can be defined in terms of difference, like this: A\B=A􀀀(􀀀B)
However, as it is more efficient to code intersection directly, we do this too.
We already have a method for processing the elements stored in trees. We 7
Things can get even worse—both binary trees and lists have only two constructors.
Other data structures have more, requiring more functions as parameters—for example
2-3-4 trees.
14 CSTR 92-10
SETSET!SET OPERATIONS  HHHj 9. T1T2
T1 􀀀AA􀀀Av A
@@@@AAA B
split(T2) 􀀀AA􀀀u C
@@@@AAAA D
<v
>| {z }|{z}
C
0
D
? HHHHHHj ?0
AC
0
BD
PPPPPq )0
combine(AC
0
, v, BD
0
)
Fig. 3: Divide and conquer scheme for T1T2.
CSTR 92-10 15
9. SETSET!SET OPERATIONS
could use inorder_fold to work through the second tree adding all of
its elements to the first tree. The result is a new tree containing all of the
element of both trees:
fun fold_union t1 t2 = inorder_fold(reverse_add,t1,t2)
This function is very elegant but not very efficient. n) It takes timeO(mlogm+ where nand mare the number of elements in t1 and t2 respectively.
The problemis that each element of t2 is inserted in a big tree, incurring the O(logm+)cost. The key to a more efficient version is to keep the trees
small when inserting and to combine large trees efficiently. Fortunately it is
possible to combine large trees very efficiently under certain conditions. 9.1Divideandconquer
The efficiency of the operation depends on the size of the operands. Reduc-
ing the size of the operands increases efficiency. One strategy for reducing
the size of the operands is Divide and Conquer.
Divide and Conquer works by breaking a big problem into small problems,
solving the small problems, and then combining the solutions to the small
problems to get the solution to the big problem. The small problems are
usually just smaller versions of the big problem and are solved in the same
way. Eventually the small problems must be so small that there is another
easy way to solve them, so the dividing terminates.
Operations involving a single tree have an obvious structure on which to
base the division—two subtrees
8
. The strategy for operationswith two trees
is to pick one tree to guide the control and to force the other tree into the
right shape by cutting it into pieces and building two trees that are suitable
to accompany the subtrees of the control tree. This is illustrated in figure 3. 9.2Union
This is how union is implemented using the divide and conquer strategy:
fun union (E,tree2) = tree2
| union (tree1,E) = tree1
| union (tree1, T(v,_,l,r)) =
let val l’ = split_lt(tree1,v)
val r’ = split_gt(tree1,v)
in
concat3(v, union(l’,l), union(r’,r))
end
The first two cases handle the trivial problem of combining a tree with an
empty tree. The third case implements the divide and conquer steps. 8
treefold is a simple divide and conquer algorithm.
16 CSTR 92-10
UNION 9.2 split_lt and split_gt return a tree containing all those elements
in the original treewhich are less than (or greater than) the cut element v. l’ and r’ corresponds to C
0
and D
0
in figure 3. concat3 joins to trees using an element. It is the final member of the
smart constructor hierarchy described in section 6. The second parameter is used as the control tree. The reason for this is
that if both trees contain the same element then the final tree has the
copy that originated from the control tree, matched against v. When
a tree is used as a map union behaves like the map override operator 9
. This reason is the same as for the choice of the last case for the add
on page 11, except that it costs no more than using the first parameter
to guide the control. When the tree is to implement a set this does not
matter. split_lt and split_gt can be implemented to take time O(logn)
in the size of the tree. concat(v,l,r) can be implemented to take time O(logn􀀀logm)
where the larger of l and r has size nand the smaller has size m. The entire union operation takes worst case time O(n+)where n
and mare the sizes of the trees being combined. In cases where one tree is small or the trees have dense regions that
do not overlap the operation is considerably faster.
The function concat3 is used to join two trees with an element that is
between the values in the left tree and the values in the right tree. If the left
and right arguments would make a balanced tree then they can be joined
immediately. If one tree is significantly larger then it is scanned to find the
largest subtree on the side ‘facing’ the smaller tree that is small enough to
balance with the smaller tree. The tree is joined at this position and the
higher levels are rebalanced if necessary.
fun concat3 (v,E,r) = add(r,v)
| concat3 (v,l,E) = add(l,v)
| concat3 (v, l as T(v1,n1,l1,r1), r as T(v2,n2,l2,r2)) =
if weight*n1 < n2 then T’(v2,concat3(v,l,l2),r2)
else if weight*n2 < n1 then T’(v1,l1,concat3(v,r1,r))
else N(v,l,r) 9
defined as (fg)(x)=g(x)if g(x)is defined =fotherwise
CSTR 92-10 17
9. SETSET!SET OPERATIONS
When the trees are nearly the same size, then concat3 does very littlework.
A tree is split by discarding all the unwanted elements and subtrees, and
joining together all the wanted parts using concat3.
fun split_lt (E,x) = E
| split_lt (T(v,_,l,r),x) =
if lt(x,v) then split_lt(l,x)
else if lt(v,x) then concat3(v,l,split_lt(r,x))
else l
split_lt takes time O(logn). At first it might be expected to take time O(log2n)as each of the recursive calls might call concat3 which
might take logarithmic time itself. This does not happen because the order
of the calls to concat3 ensures that the small trees are joined together
before being joined to the larger trees. So concat3 is usually called with
comparably sized trees, and if concat3 is called with trees of greatly
differing size this is compensated by the fact that it is called less often.
The running time of union is at worst O(n+m). The result follows from
the observation that at each node the operation takes time logarithmic in the
size of the tree at that node. As the number of nodes increases exponentially
with the logarithm of the size of the tree, this is the dominant factor, so the
logarithmic operation of split_lt (and in some circumstances concat3)
does not affect the order of the computation. There are so many more
computations on small trees that are cheaper that is the dominant factor.
The running time of union is better for fortuitous inputs, for example,
similar sized disjoint ranges or trees which differ greatly in size. 9.3Dierenceandintersection
Asymmetric set difference also uses the divide and conquer strategy to
achieve a linear time behaviour. The main difference comparedwith union
is that difference must exclude certain values from the result. This
is achived by making the second argument guide the control flow. Each
element in the second argument is excluded because it does not appear in
the split parts of the first tree and it is not included in final expression. This
requires a function, concat, to concatenate two trees.
fun difference (E,s) = E
| difference (s,E) = s
| difference (s, T(v,_,l,r)) =
let val l’ = split_lt(s,v)
val r’ = split_gt(s,v)
in
concat(difference(l’,l),difference(r’,r))
end
18 CSTR 92-10
HEDGE UNION 10.
concat is easily coded in terms of concat3. As concat does not have the
benefit of a ‘glue element’ one may be obtained by removing it from one of
the parameters. This is much like how delete’ worked:
fun concat (t1, E) = t1
| concat (t1, t2) = concat3(min t2, t1, delmin t2)
A slight improvement is to postpone the calls to min and delmin until the
last possible moment. Then these functions operate on smaller trees. The
rewritten concat then looks like concat3:
fun concat (E, t2) = t2
| concat (t1, E) = t1
| concat (t1 as T(v1,n1,l1,r1), t2 as T(v2,n2,l2,r2)) =
if weight*n1 < n2 then T’(v2,concat(t1,l2),r2)
else if weight*n2 < n1 then T’(v1,l1,concat(r1,t2))
else T’(min t2,t1, delmin t2)
We can expect difference to be a little slower than union because
concat always takes time O(logn).
A simple version of intersection relies on the identity A\B=􀀀(B􀀀A).
The elements are removed from Bso that the elements remaining come
from Bfor uniformity with union.
fun intersection (a,b) = difference(b,difference(b,a))
This calls difference twice. It is faster to code intersection using
divide and conquer like the previous operations:
fun intersection (E,_) = E
| intersection (_,E) = E
| intersection (s, T(v,_,l,r)) =
let val l’ = split_lt(s,v)
val r’ = split_gt(s,v)
in
if member(v,s) then
concat3(v,intersection(l’,l),intersection(r’,r))
else
concat(intersection(l’,l),intersection(r’,r))
end
The intersection contains only members occuring in both trees, so it is
necessary to test the membership of v in the first tree. 10Hedgeunion
There is room for improvement in the performance of union. There is
some inefficiency in the divide-and-conquer framework. At each level of
CSTR 92-10 19
10. HEDGE UNION
recursion a tree is split into two subtrees, possibly leaving an element left
over. This costs O(logn) time and space. At the next level of recursion the
freshly constructed subtrees are split again. In this section we develop
hedge_union which improves on the absolute efficiency of union by
avoiding some of this cost.
Each of the subtrees produced by the splitting contains a subrange of the
values in the original tree. We say that this set is the original set restricted to
(i.e. intersected with) the subrange (low;high). It is convenient to think of
the original tree as being ‘restricted’ by the subrange (􀀀1;+1).
Instead of splitting the tree, we can just make a note of the subrange that
restricts the tree. The triple (tre;lowhigh)is used instead of the subtrees
produced by the splitting. The splitting isdeferred until the control tree (first
argument of union) is empty, when we use split_lt and split_gt to
extract the valid subrange. The result is union’:
fun union’ (E,(s2,lo,hi)) =
split_gt(split_lt(s2,hi),lo)
| union’ (T(v,_,l1,r1), (s2,lo,hi)) =
concat3(v,
union’(l1,(s2,lo,v)),
union’(r1,(s2,v,hi)))
The astute reader will notice several problems with union’. Most impor-
tant is that it runs in time O(nlogn)which isworse than union’s O(n)time.
This is because for every one of theO(nempty subtrees in the first argument
the entire second argument is split atlogn)cost (remember union runs in O(n)time because the vastmajority of the splitting operations are performed
on small trees).
A second problem with union’ is that, as s2 is always passed to the
recursive calls unaltered, there is little point checking that it is empty. The
operation BigSet[SmalSettakes longer BigSet to compute than SmalSet[ . The behavioural transparency of the algorithm has been lost.
Finally, the base case calls split_gt on the result of split_lt. A more
efficient solution would be to implement the combined operation, but in
fact this problem goes away when the previous two are solved.
The key observation is that if a tree is rooted at a node with value outside
of the restricting range then only one of the subtrees can intersect that
range. That subtree should be passed instead of the whole tree. We
introduce the invariant that either the tree is empty or the root of the tree
lies within the subrange. The invariant is established for any tree and
subrange by descending the tree as in figure 4. The full implementation of
hedge_union
10
is given in figure 5. 10
The reason for thsi name is now apparent: the tree is bounded by the ‘hedges’ lowand highbut these bounds are soft as bits of the tree may poke through them.
20 CSTR 92-10
HEDGE UNION aPPPPPPP f 10. A dPPPPHHHA
􀀀A􀀀b􀀀􀀀@A􀀀@􀀀􀀀c􀀀@􀀀@􀀀A@@􀀀A@􀀀@Ae@@A
>low <high
Fig. 4: Lazy subrange intersection. If the tree at arestricted to the exclusive
range (low;high)is equivalent to the tree at drestricted to the same range.
The nodes aand f, and their left and right subtrees respectively may be
discounted because they do not overlap the range (low;high)and cannot
overlap any subrange of (low;high).
CSTR 92-10 21
11. ADDITIONAL OPERATIONS: RANK AND INDEXING
The trim function returns a (sub)tree satisfying the invariant. The heart
of the algorithm is uni_bd, which cures the ills of union’. The first base
case tests whether the second parameter is empty. This is worthwhile again
because the invariant ensures that this parameter reduces in size in at least
one of the recursive calls. In the second base case of uni_bd , when the
first tree is empty, it is no longer necessary to call split_gt on the result of
split_lt as we already know that v is between lo and hi. The subtrees
can be trimmed as appropriate.
The algorithmstartswith the range (􀀀1;+1). It is undesirable to program
this explicity for a number of reasons. If values are chosen to represent the
infinities, the algorithm will fail when those values appear in a set. There
may be no such value, for example, it is always possible to find a greater
alphabetic string by appending a ‘z’: ‘z’ <‘zz’ <‘zzz’ etc.
The solution used in figure 5 is specialise to the functions trim and uni_bd
for the caseswhen one or both of hi =+1and lo =􀀀1. This accounts for
all of the additional functions. Specialising a function is simple, if tedious: Copy the function and give it a new name. Remove the parameter, say lo. Subsititute 􀀀1for lo in the body of the function. Use the properties of 􀀀1, like 􀀀1|< lo| is alwats true, to remove
all references to it.
The case where both hi and lo are infinities has the correct signature to
replace union. Hedge versions of set difference and intersection may be
derived in the same way.
The run time of hedge_union is O(n), like union, but it is faster than
unionbecause it does fewer operations: the splitfunctions are called only
to build subtrees that are necessary. Test runs indicate that hedge_union is
usually 20%faster than union. In theworst case, SmalSet[BigSet, it runs
in about the same time. Whether this is worth the additional complexity
depends on the application. Small constant factors are only worth this
degree of effort when the program is a commonly used part of a library. 1Aditionaloperations:rankandindexing
Each tree node contains a count of the elements in the tree rooted at that
node. This was justified in section 3—the information is used to ensure that
only balanced trees were built, and is useful in its own right. Without the
count data it would take O(n)time to count the number of elements by
traversing the tree instead of constant time.
The count data can also be used to determine the rank of an element and to
retrieve an element by its rank. The elements are ranked according to the
22 CSTR 92-10
ADDITIONAL OPERATIONS: RANK AND INDEXING 11.
fun trim (lo,hi,E) = E
| trim (lo,hi,s as T(v,_,l,r)) =
if lt(lo,v) then
if lt(v,hi) then s
else trim(lo,hi,l)
else trim(lo,hi,r)
fun uni_bd (s,E,lo,hi) = s
| uni_bd (E,T(v,_,l,r),lo,hi) =
concat3(v,split_gt(l,lo),split_lt(r,hi))
| uni_bd (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),lo,hi) =
concat3(v,
uni_bd(l1,trim(lo,v,s2),lo,v),
uni_bd(r1,trim(v,hi,s2),v,hi))
(* inv: lo < v < hi *)
(*all the other versions of uni and trim are
specializations of the above two functions with
lo=-infinity and/or hi=+infinity *)
fun trim_lo (_ ,E) = E
| trim_lo (lo,s as T(v,_,_,r)) =
if lt(lo,v) then s else trim_lo(lo,r)
fun trim_hi (_ ,E) = E
| trim_hi (hi,s as T(v,_,l,_)) =
if lt(v,hi) then s else trim_hi(hi,l)
fun uni_hi (s,E,hi) = s
| uni_hi (E,T(v,_,l,r),hi) =
concat3(v,l,split_lt(r,hi))
| uni_hi (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),hi) =
concat3(v,
uni_hi(l1,trim_hi(v,s2),v),
uni_bd(r1,trim(v,hi,s2),v,hi))
fun uni_lo (s,E,lo) = s
| uni_lo (E,T(v,_,l,r),lo) =
concat3(v,split_gt(l,lo),r)
| uni_lo (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),lo) =
concat3(v,
uni_bd(l1,trim(lo,v,s2),lo,v),
uni_lo(r1,trim_lo(v,s2),v))
fun hedge_union (s,E) = s
| hedge_union (E,s2 as T(v,_,l,r)) = s2
| hedge_union (T(v,_,l1,r1), s2 as T(v2,_,l2,r2)) =
concat3(v,
uni_hi(l1,trim_hi(v,s2),v),
uni_lo(r1,trim_lo(v,s2),v))
Fig. 5: Hedge union
CSTR 92-10 23
12. SUMMARY
ordering relation <. The minimum element has rank 0, the next smallest
has rank 1 and so on. By basing the rank on 0, the rank of an element
is simply the number of elements to the left of that element. The rank is
computed by summing the sizes of all the left-subtrees not taken on the path
to the element. If the element doesn’t appear in the tree then a Subscript
exception is signalled.
exception Subscript
fun rank (E,x) = raise Subscript
| rank (T(v,n,l,r), x) =
if x<v then rank(l,x)
else if x>v then rank(r,x) + size l + 1
else size l
Indexing uses the size information to navigate to the element. At each node
the index is checked against the size of the left subtree. If the index is
smaller than the size of the subtree then the element must be somewhere
in that subtree. If it is greater then it must be in the right subtree, but the
number of elements in the left subtree and the one at current node must be
discounted first. As always, if the element is in neither subtree then it must
be at the current node.
fun index (E,_) = raise Subscript
| index (T(v,_,l,r), i) =
let val nl = size l
in
if i<nl then index(l,i)
else if i>nl then index(r,i-nl-1)
else v
end 12Summary
Very little in this report is new. Balanced binary search trees have been
around for a long time. In particular, the methods of analysis and results,
(O(logn)insertion, O(n)union etc., are long established facts. This presen-
tation, however, has several novel features: the functional programming style the abstraction away from the balancing algorithm the hedge_union algorithm
From this exercise we conclude that
24 CSTR 92-10
SUMMARY 12. It is nearly as easy to implement balanced trees as unbalanced trees if
the concept of ‘rebalancing constructors’ is taken on board. Rebalancing constructors need an absolute measure of the size or
height of the tree. AVL trees which encode a height difference between
left and right subtrees are not easy to code because the rebalancing
depends on the change in height rather than an absolute measure. Bounded Balance binary trees aremore useful thanother type of binary
tree. All the balanced tree schemes have logarithmic insert and delete, so
when comparing them the constant factor is very important. One
thing that affects the constant factor is the size of the node—every
node has to be initialized and competes for space, putting presure on
the memory allocation system. Small is beautifull.
The program described here was written in response to a competition put
forward by Andrew Appel. Without this spur I would never have taken the
time to finish the program, or write this report. I would like to thank Andy
Gravell for the valuable discussions on the content and presentation of this
report.
CSTR 92-10 25
A. BALANCE MAINTENANCE CONSTRAINTS ABalancemaintenanceconstraints
This section derives the constraints on the parameters used to maintain
a balanced tree. The parameters are wand . wis the bounded balance
criterion, themaximumfactor bywhich one subtree canoutweigh its sibling.
A tree is balanced if and only if either
1. Both subtrees have one or no elements, or
2. The number of elements in a subtree does not exceed wtimes the
number of elements in the other subtree, i.e.  sisziezer(ilgefhtt))wwssiizzee((rleifghtt) is a decision variable. We will consider the case where a tree has been
altered by the addition or deletion of a single element. Given a node which
has two balanced subtrees, but the subtrees fail to satisfy the criteria (2) we
will choose to apply a single or double rotation to restore balance. We choose
a single or double rotation depending on the relative size of the subtrees of
the heavier subtree of the unbalanced node. measures the actual factor
by which the outermost subtree exceeds the weight of the inner subtree. As
this subtree is balanced, 1=ww.
In the rest of this section is devoted to investigating relationship between w
and . We wish to know for what values of wdo the tree rotations restore
balance and what values of should be used to choose a single or a double
rotation. Only the case of the right subtree being one element too heavy is
considered. This case is the same as the left tree being too light by one, and
the other cases are generated by symmetry.
a xA b 􀀀nA􀀀@A􀀀@@n@+AnAAn+1 - Single Rotation x+n+1a xAA b nAAnA 􀀀 @@􀀀 􀀀 @
Fig. 6: Single rotation.
26 CSTR 92-10
SINGLE ROTATION A.1 A.1Singlerotation
The transfer of elements in a single left rotation is characterized by figure 6.
Before rotation the tree is out of balance at node a, so n++1=wx+1
giving x=w+1n
We require that the rotated tree is balanced. Each each node the has to be
balanced, with neither the left subtree nor the right subtree too heavy. This
leads to the following constraints:
1. xwn
2. nx
3. x++1wn
4. nw(x++1)
Rearranging to constrain in terms of wyields:
1. w2􀀀1
2. 0, which is ignored because it is weaker than 1=ww.
3. (2w+1)=(w2􀀀1)
4. no constraint on ; 0
Some constraints, like case 3, express a constraint on in terms of wand n. Since ncan vary we pick the strongest constraint for any positive whole
value of . Case 3 is reduced as follows: x+n+1wn
Substituting for xand rearranging gives n(nww+21􀀀)+1)w
If n=1this reduces to 2ww2􀀀+11
As n!1the single win the numerator becomes irrelevant, so ww2+􀀀11
The constraint for n=1is the stronger so it is chosen.
CSTR 92-10 27
A. BALANCE MAINTENANCE xA CONSTRAINTS a y=wn+nn+AA1b wAnAc 􀀀􀀀􀀀􀀀@􀀀@@􀀀@@y@+zAzA+1 - Double Rotation xAAa x+nAAnA+1b wAnAc 􀀀@@H􀀀􀀀@w@nzA+Az+1
xA Fig. 7: Double rotation case (a). a y=wn+nw+AnA1b nAAc 􀀀􀀀􀀀􀀀@􀀀@@􀀀@@y@+zAzA+1 - Double Rotation xAAa xw+AnAwAn+1nAAc 􀀀@@H􀀀􀀀@n@+zAAzA+1
Fig. 8: Double rotation case (b). A.2Doublerotation
There are two cases to be considered. The inner tree at b may be right-heavy
(figure 7) or left-heavy (figure 8). These two cases are sufficient as it is our
concern that the rotation will fail to balance the tree by leaving too little or
adding too much at aand cin the final tree.
For the first case constraints are
1. xwn, which reduces to n(wn2(􀀀+w1􀀀)+1)1+1
28 CSTR 92-10
THE (w;)SPACE A.3
As the numerator is quadratic in w, we have to choose the strongest
constraint (n=1or n!1) depending on w: w(􀀀1)=(w+2) if 1+p3(when n=1) (2􀀀+1) if w1+(when !1)
2. nwxtrivially yields 0
3. wnzyields 1=(w+1)
4. zwwnyields 2=(+2)
5. x+n+1w(wn+z+1)yields 􀀀(w2􀀀􀀀1)=(w2􀀀1)which
is subsumed by 0for w2􀀀􀀀10, i.e. 1:6181
6. wn+x+1w(x+n+1), which does not constrain .
For the second case the constraints are calculated as for the first case. There
is only one case that is not weaker than other constraints that we have
already encountered:
7. A.3The(wz;w)spna, cwehich yields ww+2
The graph in figure 9 illustrates the constraints that affect a practical
1
choice of wand .
The graph may be interpreted as follows Single rotations restore balance in the clear region above the dashed
line. Double rotations restore balance in the clear region below the dotted
line. In the shaded region it is not possible to balance the tree. There are
two reasons:
1. There are no trees in the region above the line =wand below =1=wbecause the right subtree must already be balanced.
2. In the remainder of the shaded region both single and double
rotations fail to restore balance. To guarantee that it is possible to restore balance wmust be chosen to
the right of the ‘cusp’ at (wc;c)where wc3:745and c0:652. 1
Double rotation constraint 1 would nip the corner off the clear area near (2,
12) but that
does not affect the choice.
CSTR 92-10 29
A. BALANCE MAINTENANCE CONSTRAINTS
0
0.5
1
1.5
2
2.5
w 0 2 4 6
w1 w
2ww2􀀀+11(wc;c) ww+2
(4.646,
12)
Fig. 9: The (w;)parameter space. Values of wand should be chosen
to lie in the region below the dotted curve ==(+2)and above the
dashed curve =(2w+1)=(w2􀀀1).
30 CSTR 92-10
THE (w;)SPACE A.3 If w=wcthen the rebalancing algorithmmust calculate for the tree.
If this <cthen a double rotation must be applied, otherwise a
single rotation must be applied. If a more convenient =12is used this constrains us to use w>4:646
CSTR 92-10 31
BIBLIOGRAPHY BAsimplewaytoreasonaboutthetres
The analysis presented in the previous section is detailed and difficult. This
appendix notes a less rigorous and easier way to reason about bounded
balance trees.
The important factor determining the speed of operations on a tree are The maximum path length from the root node to an element. Balanc-
ing ensures that this height is logarithmic in the size of the tree. The extra cost of operations to detect and correct an imbalance.
A rotation is used to build a tree when otherwise the tree would be
unbalanced. The criterion that no tree should have more than wtimes
the number of elements than its sibling is roughly equivalent to saying that,
since the height of a tree is logarithmic in its size, one tree should never be
more than a fixed amount higher than its sibling. The rotations (figure 1)
must be applied to lift the larger (hence taller) trees at the expense of the
smaller trees. This is exactly what T’ does.
Both analytical and empirical evidence ([4],[3]) suggests that there is little
to choose between various balancing schemes, and that balancing is unnec-
essary on random data. Balancing is only necessary to prevent poor worst
case behaviour and the evidence suggests that almost any scheme to avoid
poor behaviour in the pathological cases will produce acceptable results. Bibliography
[1] Aho, Hopcroft & Ullman, 1974. The Design and Analysis of Computer
Algorithms. Addison-Wesley.
[2] Adel´son-Vel´skii, G. M. and Y. M. Landis, 1962. “An algorithm for the
organization of information”, Dokl. Akad. Nauk SSSR 146, 263–266 (in
Russian). English translation in Soviet Math. Dokl. 3, 1962, 1259–1262.
[3] Guibas, L. J. and R. Sedgewick, 1978. “A dichromatic framework for
balanced trees”, Proc. 9thACMSymposiumon the Theory of Computing,
49–60.
[4] Nievergelt, J. and E. M. Reingold, 1973. “Binary search trees of bounded
balance”, SIAM J. Computing 2(1), March 1973.
32 CSTR 92-10
	(*

	http://groups.csail.mit.edu/mac/users/adams/BB/BB.sml

	Copyright 1992-1996 Stephen Adams.

	This software may be used freely provided that:
	1. This copyright notice is attached to any copy, derived work,
	or work including all or part of this software.
	2. Any derived work must contain a prominent notice stating that
	it has been altered from the original.

	*)

	(* Address: Electronics & Computer Science
	University of Southampton
	Southampton SO9 5NH
	Great Britian
	E-mail: sra@ecs.soton.ac.uk

	Comments:

	1. The implementation is based on Binary search trees of Bounded
	Balance, similar to Nievergelt & Reingold, SIAM J. Computing
	2(1), March 1973. The main advantage of these trees is that
	they keep the size of the tree in the node, giving a constant
	time size operation.

	2. The bounded balance criterion is simpler than N&R's alpha.
	Simply, one subtree must not have more than `weight' times as
	many elements as the opposite subtree. Rebalancing is
	guaranteed to reinstate the criterion for weight>2.23, but
	the occasional incorrect behaviour for weight=2 is not
	detrimental to performance.

	3. There are two implementations of union. The default,
	hedge_union, is much more complex and usually 20% faster. I
	am not sure that the performance increase warrants the
	complexity (and time it took to write), but I am leaving it
	in for the competition. It is derived from the original
	union by replacing the split_lt(gt) operations with a lazy
	version. The `obvious' version is called old_union.
	*)

	structure B (: INTSET) =
	struct

	local

	type T = int
	val lt : T*T->bool = op <

	(* weight is a parameter to the rebalancing process. *)
	val weight:int = 3

	datatype Set = E \| T of T * int * Set * Set

	fun size E = 0
	\| size (T(_,n,_,_)) = n

	(fun N(v,l,r) = T(v,1+size(l)+size(r),l,r))
	fun N(v,E, E) = T(v,1,E,E)
	\| N(v,E, r as T(_,n,_,_)) = T(v,n+1,E,r)
	\| N(v,l as T(_,n,_,_),E) = T(v,n+1,l,E)
	\| N(v,l as T(_,n,_,_),r as T(_,m,_,_)) = T(v,n+m+1,l,r)

	fun single_L (a,x,T(b,_,y,z)) = N(b,N(a,x,y),z)
	\| single_L _ = raise Match
	fun single_R (b,T(a,_,x,y),z) = N(a,x,N(b,y,z))
	\| single_R _ = raise Match
	fun double_L (a,w,T(c,_,T(b,_,x,y),z)) = N(b,N(a,w,x),N(c,y,z))
	\| double_L _ = raise Match
	fun double_R (c,T(a,_,w,T(b,_,x,y)),z) = N(b,N(a,w,x),N(c,y,z))
	\| double_R _ = raise Match

	fun T' (v,E,E) = T(v,1,E,E)
	\| T' (v,E,r as T(_,_,E,E)) = T(v,2,E,r)
	\| T' (v,l as T(_,_,E,E),E) = T(v,2,l,E)

	\| T' (p as (_,E,T(_,_,T(_,_,_,_),E))) = double_L p
	\| T' (p as (_,T(_,_,E,T(_,_,_,_)),E)) = double_R p

	(* these cases almost never happen with small weight*)
	\| T' (p as (_,E,T(_,_,T(_,ln,_,_),T(_,rn,_,_)))) =
	if ln<rn then single_L p else double_L p
	\| T' (p as (_,T(_,_,T(_,ln,_,_),T(_,rn,_,_)),E)) =
	if ln>rn then single_R p else double_R p

	\| T' (p as (_,E,T(_,_,E,_))) = single_L p
	\| T' (p as (_,T(_,_,_,E),E)) = single_R p

	\| T' (p as (v,l as T(lv,ln,ll,lr),r as T(rv,rn,rl,rr))) =
	if rn>=weightln then (right is too big*)
	let val rln = size rl
	val rrn = size rr
	in
	if rln < rrn then single_L p else double_L p
	end

	else if ln>=weightrn then (left is too big*)
	let val lln = size ll
	val lrn = size lr
	in
	if lrn < lln then single_R p else double_R p
	end

	else
	T(v,ln+rn+1,l,r)

	fun add (E,x) = T(x,1,E,E)
	\| add (set as T(v,_,l,r),x) =
	if lt(x,v) then T'(v,add(l,x),r)
	else if lt(v,x) then T'(v,l,add(r,x))
	else set

	fun concat3 (E,v,r) = add(r,v)
	\| concat3 (l,v,E) = add(l,v)
	\| concat3 (l as T(v1,n1,l1,r1), v, r as T(v2,n2,l2,r2)) =
	if weight*n1 < n2 then T'(v2,concat3(l,v,l2),r2)
	else if weight*n2 < n1 then T'(v1,l1,concat3(r1,v,r))
	else N(v,l,r)

	fun split_lt (E,x) = E
	\| split_lt (t as T(v,_,l,r),x) =
	if lt(x,v) then split_lt(l,x)
	else if lt(v,x) then concat3(l,v,split_lt(r,x))
	else l

	fun split_gt (E,x) = E
	\| split_gt (t as T(v,_,l,r),x) =
	if lt(v,x) then split_gt(r,x)
	else if lt(x,v) then concat3(split_gt(l,x),v,r)
	else r

	fun min (T(v,_,E,_)) = v
	\| min (T(v,_,l,_)) = min l
	\| min _ = raise Match

	and delete' (E,r) = r
	\| delete' (l,E) = l
	\| delete' (l,r) = let val min_elt = min r in
	T'(min_elt,l,delmin r)
	end
	and delmin (T(_,_,E,r)) = r
	\| delmin (T(v,_,l,r)) = T'(v,delmin l,r)
	\| delmin _ = raise Match

	fun concat (E, s2) = s2
	\| concat (s1, E) = s1
	\| concat (t1 as T(v1,n1,l1,r1), t2 as T(v2,n2,l2,r2)) =
	if weight*n1 < n2 then T'(v2,concat(t1,l2),r2)
	else if weight*n2 < n1 then T'(v1,l1,concat(r1,t2))
	else T'(min t2,t1, delmin t2)

	fun fold(f,base,set) =
	let fun fold'(base,E) = base
	\| fold'(base,T(v,_,l,r)) = fold'(f(v,fold'(base,r)),l)
	in
	fold'(base,set)
	end

	in

	val empty = E

	fun singleton x = T(x,1,E,E)


	local
	fun trim (lo,hi,E) = E
	\| trim (lo,hi,s as T(v,_,l,r)) =
	if lt(lo,v) then
	if lt(v,hi) then s
	else trim(lo,hi,l)
	else trim(lo,hi,r)


	fun uni_bd (s,E,lo,hi) = s
	\| uni_bd (E,T(v,_,l,r),lo,hi) =
	concat3(split_gt(l,lo),v,split_lt(r,hi))
	\| uni_bd (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),lo,hi) =
	concat3(uni_bd(l1,trim(lo,v,s2),lo,v),
	v,
	uni_bd(r1,trim(v,hi,s2),v,hi))
	(* inv: lo < v < hi *)

	(*all the other versions of uni and trim are
	specializations of the above two functions with
	lo=-infinity and/or hi=+infinity *)

	fun trim_lo (_ ,E) = E
	\| trim_lo (lo,s as T(v,_,_,r)) =
	if lt(lo,v) then s else trim_lo(lo,r)
	fun trim_hi (_ ,E) = E
	\| trim_hi (hi,s as T(v,_,l,_)) =
	if lt(v,hi) then s else trim_hi(hi,l)

	fun uni_hi (s,E,hi) = s
	\| uni_hi (E,T(v,_,l,r),hi) =
	concat3(l,v,split_lt(r,hi))
	\| uni_hi (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),hi) =
	concat3(uni_hi(l1,trim_hi(v,s2),v),
	v,
	uni_bd(r1,trim(v,hi,s2),v,hi))

	fun uni_lo (s,E,lo) = s
	\| uni_lo (E,T(v,_,l,r),lo) =
	concat3(split_gt(l,lo),v,r)
	\| uni_lo (T(v,_,l1,r1), s2 as T(v2,_,l2,r2),lo) =
	concat3(uni_bd(l1,trim(lo,v,s2),lo,v),
	v,
	uni_lo(r1,trim_lo(v,s2),v))

	fun uni (s,E) = s
	\| uni (E,s as T(v,_,l,r)) = s
	\| uni (T(v,_,l1,r1), s2 as T(v2,_,l2,r2)) =
	concat3(uni_hi(l1,trim_hi(v,s2),v),
	v,
	uni_lo(r1,trim_lo(v,s2),v))

	in
	val hedge_union = uni
	end


	fun old_union (E,s2) = s2
	\| old_union (s1,E) = s1
	\| old_union (s1 as T(v,_,l,r),s2) =
	let val l2 = split_lt(s2,v)
	val r2 = split_gt(s2,v)
	in
	concat3(old_union(l,l2),v,old_union(r,r2))
	end

	(* The old_union version is about 20% slower than
	hedge_union in most cases *)

	val union = hedge_union
	(val union = old_union)

	val add = add

	fun difference (E,s) = E
	\| difference (s,E) = s
	\| difference (s, T(v,_,l,r)) =
	let val l2 = split_lt(s,v)
	val r2 = split_gt(s,v)
	in
	concat(difference(l2,l),difference(r2,r))
	end

	fun member (x,set) =
	let fun mem E = false
	\| mem (T(v,_,l,r)) =
	if lt(x,v) then mem l else if lt(v,x) then mem r else true
	in mem set end

	(fun intersection (a,b) = difference(a,difference(a,b)))

	fun intersection (E,_) = E
	\| intersection (_,E) = E
	\| intersection (s, T(v,_,l,r)) =
	let val l2 = split_lt(s,v)
	val r2 = split_gt(s,v)
	in
	if member(v,s) then
	concat3(intersection(l2,l),v,intersection(r2,r))
	else
	concat(intersection(l2,l),intersection(r2,r))
	end

	fun members set = fold(op::,[],set)

	fun cardinality E = 0
	\| cardinality (T(_,n,_,_)) = n

	fun delete (E,x) = E
	\| delete (set as T(v,_,l,r),x) =
	if lt(x,v) then T'(v,delete(l,x),r)
	else if lt(v,x) then T'(v,l,delete(r,x))
	else delete'(l,r)

	fun fromList l = List.fold (fn(x,y)=>add(y,x)) l E

	type intset = Set

	end
	end

	structure IntSet : INTSET =B;