Skip to content

Instantly share code, notes, and snippets.

@bcse
Last active December 25, 2015 21:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bcse/7046470 to your computer and use it in GitHub Desktop.
Save bcse/7046470 to your computer and use it in GitHub Desktop.
Python OrderedSet

OrderedSet is a set-like class that also keeps inserting order like list.

Here are some test results compared with ABC-based OrderedSet:

  My OrderedSet ABC-based Boost::MultiIndex-based
init with 100k random int 0.021977 0.118927 0.071423
union 0.041634 0.251826 0.095461
intersection 0.052491 0.138458 0.047158
difference 0.045817 0.103844 0.030975
symmetric_difference 0.114590 0.336064 0.059608
update 0.012893 0.076916 0.045201
intersection_update 0.051608 0.134896 0.066841
difference_update 0.045373 0.099654 0.043339
symmetric_difference_update 0.104951 0.153419 0.087790
add 0.112246 0.198614 0.090457
discard 22.204200 0.107926 4.758838
pop 0.069853 0.157067 0.027254
convert to list 0.001135 0.013994 0.004891
convert to reversed list 0.000101 0.000156 0.000457
[(i in a) for i in b] 0.044013 0.062753 0.021977
  • My OrderedSet is slightly faster than ABC-based OrderedSet in all tests, except discard. My OrderedSet's discard is way slower than ABC-based OrderedSet.
  • Since discard is slow, I didn't implment real in-place intersection, difference and symmetric_difference. They return a new OrderedSet instance instead of modify the original instance.
class OrderedSet(object):
"""A OrderedSet object is an ordered collection of distinct hashable objects.
It's designed to be a drop-in replacement of set.
Except `remove`, `pop` behavior is like a list, not a set."""
__hash__ = None
def __init__(self, iterable=None):
if isinstance(iterable, OrderedSet):
self.__set = iterable._get_set_copy()
self.__list = iterable._get_list_copy()
else:
self.__set = set()
self.__list = []
if iterable is not None:
self |= iterable
def _get_set_copy(self):
return self.__set.copy()
def _get_list_copy(self):
return self.__list[:]
def __len__(self):
return len(self.__set)
def __contains__(self, x):
return x in self.__set
def __iter__(self):
return iter(self.__list)
def __reversed__(self):
return reversed(self.__list)
def __getitem__(self, index):
return self.__list[index]
def __delitem__(self, index):
self.pop(index)
def __getslice__(self, i, j):
return self.__list[i:j]
def __le__(self, other):
if len(self) > len(other):
return False
for elem in self:
if elem not in other:
return False
return True
def __lt__(self, other):
return len(self) < len(other) and self.__le__(other)
def __gt__(self, other):
return other < self
def __ge__(self, other):
return other <= self
def __eq__(self, other):
return len(self) == len(other) and self.__le__(other)
def __ne__(self, other):
return not (self == other)
def isdisjoint(self, other):
"""Return True if two sets have a null intersection."""
for value in other:
if value in self:
return False
return True
def issubset(self, other):
"""Report whether another set contains this set."""
return self <= other
def issuperset(self, other):
"""Report whether this set contains another set."""
return self >= other
def index(self, elem):
return self.__list.index(elem)
@classmethod
def _from_iterable(cls, iterable):
"""Construct an instance of the class from any iterable input."""
return cls(iterable)
def __and__(self, other):
other = self._from_iterable(other)
return self._from_iterable(value for value in self if value in other)
def __or__(self, other):
chain = (e for s in (self, other) for e in s)
return self._from_iterable(chain)
def __sub__(self, other):
other = self._from_iterable(other)
return self._from_iterable(value for value in self if value not in other)
def __xor__(self, other):
other = self._from_iterable(other)
return (self - other) | (other - self)
def union(self, other):
"""Return a new set with elements from the set and the other."""
return self | other
def intersection(self, other):
"""Return a new set with elements common to the set and the other."""
return self & other
def difference(self, other):
"""Return a new set with elements in the set that are not in the other."""
return self - other
def symmetric_difference(self, other):
"""Return a new set with elements in either the set or other but not both."""
return self ^ other
def copy(self):
return self._from_iterable(self)
def add(self, elem):
"""Append an element to the end of set.
This has no effect if the element is already present."""
if elem not in self:
self.__set.add(elem)
self.__list.append(elem)
def remove(self, elem):
"""Remove an element from the set.
Raises ValueError if the element is not present."""
if elem not in self:
raise ValueError('x not in set')
self.discard(elem)
def discard(self, elem):
"""Remove an element from the set.
This has no effect if the element is not present."""
import warnings
warnings.warn('This function have serious performance issue. '
'You should use `difference` when removing large amout of items.')
if elem in self:
self.__set.discard(elem)
self.__list.remove(elem)
def pop(self, index=-1):
"""Remove and return item at index (default last).
Raises IndexError if set is empty or index is out of range."""
set_len = len(self.__set)
if set_len == 0 or set_len <= index:
raise IndexError('pop from empty set')
value = self.__list.pop(index)
self.__set.discard(value)
return value
def clear(self):
"""Remove all elements from the set."""
self.__set.clear()
self.__list = []
def __ior__(self, iterable):
self.update(iterable)
return self
# TODO: Implement fast in-place and. Without this, `a &= b` will return a new `a`.
#def __iand__(self, iterable):
# self.intersection_update(iterable)
# return self
# TODO: Implement fast in-place subtract. Without this, `a -= b` will return a new `a`.
#def __isub__(self, iterable):
# self.difference_update(iterable)
# return self
# TODO: Implement fast in-place xor. Without this, `a ^= b` will return a new `a`.
#def __ixor__(self, iterable):
# self.symmetric_difference_update(iterable)
# return self
def update(self, other):
"""Update the set, keeping only elements found in it and the other."""
seen = self.__set
seen_add = seen.add
self.__list.extend(i for i in other if not i in seen and not seen_add(i))
# FIXME: Following functions doesn't work as expected. Please use &= instead.
#def intersection_update(self, other):
# """Update the set, removing elements found in the other."""
# for value in (self - iterable):
# self.discard(value)
# FIXME: Following functions doesn't work as expected. Please use -= instead.
#def difference_update(self, other):
# """Update the set, removing elements found in the other."""
# for value in iterable:
# if value in self:
# self.discard(value)
# else:
# self.add(value)
# FIXME: Following functions doesn't work as expected. Please use ^= instead.
#def symmetric_difference_update(self, other):
# """Update the set, keeping only elements found in either set, but not in both."""
# for value in iterable:
# self.discard(value)
def __repr__(self):
return 'OrderedSet(%s)' % repr(self.__list)
def __str__(self):
return repr(self)
def __unicode__(self):
return unicode(str(self))
def __del__(self):
self.clear()
if __name__ == '__main__':
a = OrderedSet('abracadabra')
b = OrderedSet('ommanipadmehum')
print(a)
print(b)
print(a < b)
print(a <= b)
print(a == b)
print(a != b)
print(a >= b)
print(a > b)
print(a | b)
print(b | a)
print(a & b)
print(b & a)
print(a - b)
print(b - a)
print(a ^ b)
print(b ^ a)
print('Benchmark...')
from random import randint
from time import time
data1 = [randint(1, 100000) for _ in xrange(100000)]
data2 = [randint(1, 100000) for _ in xrange(100000)]
t0 = time()
OrderedSet(data1)
t = time() - t0
print('init with 100k random integers: %fs' % t)
a = OrderedSet(data1)
b = OrderedSet(data2)
t0 = time()
c = a | b
t = time() - t0
print('a | b: %fs' % t)
a = OrderedSet(data1)
b = OrderedSet(data2)
t0 = time()
c = a & b
t = time() - t0
print('a & b: %fs' % t)
a = OrderedSet(data1)
b = OrderedSet(data2)
t0 = time()
c = a - b
t = time() - t0
print('a - b: %fs' % t)
a = OrderedSet(data1)
b = OrderedSet(data2)
t0 = time()
c = a ^ b
t = time() - t0
print('a ^ b: %fs' % t)
a = OrderedSet(data1)
aid = id(a)
b = OrderedSet(data2)
t0 = time()
a |= b
t = time() - t0
print('a |= b: %fs %s' % (t, '' if aid == id(a) else '(not in-place)'))
a = OrderedSet(data1)
aid = id(a)
b = OrderedSet(data2)
t0 = time()
a &= b
t = time() - t0
print('a &= b: %fs %s' % (t, '' if aid == id(a) else '(not in-place)'))
a = OrderedSet(data1)
aid = id(a)
b = OrderedSet(data2)
t0 = time()
a -= b
t = time() - t0
print('a -= b: %fs %s' % (t, '' if aid == id(a) else '(not in-place)'))
a = OrderedSet(data1)
aid = id(a)
b = OrderedSet(data2)
t0 = time()
a ^= b
t = time() - t0
print('a ^= b: %fs %s' % (t, '' if aid == id(a) else '(not in-place)'))
a = OrderedSet()
t0 = time()
c = [a.copy() for _ in xrange(100000)]
t = time() - t0
print('create 100k shallow copies: %fs' % t)
a = OrderedSet()
t0 = time()
[a.add(i) for i in data1]
t = time() - t0
print('random add 100k items: %fs' % t)
a = OrderedSet(data1)
alen = len(a)
t0 = time()
[a.__delitem__(i) for i in xrange(alen-1, -1, -1)]
t = time() - t0
print('reversed sequential delete %d items: %fs' % (alen, t))
a = OrderedSet(data1)
t0 = time()
[a.discard(i) for i in data2]
t = time() - t0
print('random discard 100k items: %fs' % t)
a = OrderedSet(data1)
t0 = time()
while 1:
try:
a.pop()
except IndexError:
break
t = time() - t0
print('pop all items: %fs' % t)
a = OrderedSet(data1)
t0 = time()
c = list(a)
t = time() - t0
print('list(a): %fs' % t)
a = OrderedSet(data1)
t0 = time()
c = reversed(a)
t = time() - t0
print('reversed(a): %fs' % t)
a = OrderedSet(data1)
b = OrderedSet(data2)
t0 = time()
[(i in a) for i in b]
t = time() - t0
print('[(i in a) for i in b]: %fs' % t)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment