Skip to content

Instantly share code, notes, and snippets.

@lrhache
Last active September 30, 2021 12:26
Show Gist options
  • Save lrhache/36a9a5ea5fe7e1f121e3 to your computer and use it in GitHub Desktop.
Save lrhache/36a9a5ea5fe7e1f121e3 to your computer and use it in GitHub Desktop.
Two method to remove duplicates from a python list while keeping order
"""Benchmark multiple method of removing duplicates in a list
while keeping item's order in the list.
numpy.unique will likely be faster and more efficient
the more the list grows.
If a list is submitted to numpy.unique, it will be transformed and flattened
into a numpy.Array which it's data structure is more efficient than standard
lists since all elements have the same size in memory therefor requires less
access to the memory. Lists are stored as array of adresses pointing to
to the objects and requires more memory access (but more flexible).
Requirements:
pip install numpy prettytable
Usage:
python list_unique_benchmark.py <number of items>
"""
import math
import timeit
import itertools
import argparse
import numpy as np
from prettytable import PrettyTable
from random import shuffle
import multiprocessing as mp
def numpy_unique(seq):
"""Remove duplicate from a list
Required:
seq: A list containing all items
Returns:
A list with only unique values. Only the first occurence is kept.
"""
return np.unique(seq)
def numpy_unique_ordered(seq):
"""Remove duplicate from a list while keeping order with Numpy.unique
Required:
seq: A list containing all items
Returns:
A list with only unique values. Only the first occurence is kept.
"""
array_unique = np.unique(seq, return_index=True)
dstack = np.dstack(array_unique)
dstack.dtype = np.dtype([('v', dstack.dtype), ('i', dstack.dtype)])
dstack.sort(order='i', axis=1)
return dstack.flatten()['v'].tolist()
def list_unique(seq):
"""Remove duplicate from list while keeping order
Required arguments:
seq: A list containing all objects
Returns:
A list with only unique values. Only the first occurence is kept.
"""
def filter_uniques(seq):
# We use set() since it's faster at lookups.
seen = set()
for x in seq:
if x in seen:
continue
seen.add(x)
yield x
return list(filter_uniques(seq))
def random_string():
""" Create a random string
Returns:
A random string lowercase
"""
length = 4
chars = 'abcdefghijklmnopqrstuvwxyz'
char_list = list(chars)
shuffle(char_list)
string = ''.join(char_list)
return string[:length]
def get_testdata(nb_items):
"""Create a list of test data
Required arguments:
nb_items: The number of item to generate in list
Returns:
A list of random string which total item is equal to nb_items
"""
chunks_size = 1000
nb_chunks = int(math.floor(nb_items/chunks_size))
chunks = [chunks_size] * nb_chunks
chunks.append(nb_items - (nb_chunks*chunks_size))
pool = mp.Pool()
result = pool.map_async(gen_data, chunks)
return list(itertools.chain(*result.get()))
def gen_data(n):
data = []
for i in xrange(n):
data.append(random_string())
return data
def benchmark(fn, *args):
def wrapper():
fn(*args)
time = timeit.Timer(wrapper).timeit(10)
return time
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Benchmark list unique.')
parser.add_argument('nb_items', metavar='<nb of items>', type=int,
nargs='?', help='number of items to generate')
args = parser.parse_args()
nb_items = args.nb_items
if not nb_items:
items_count = [1000, 10000, 1000000, 10000000]
else:
items_count = [nb_items]
print "Generating %s items for test data" % nb_items
testdata = get_testdata(items_count[-1])
test_fn = list_unique, numpy_unique, numpy_unique_ordered
table = PrettyTable([''] + items_count)
for fn in test_fn:
results = [benchmark(fn, testdata[:count]) for count in items_count]
table.add_row([fn.__name__] + results)
print table
+----------------------+------------------+-----------------+---------------+---------------+
| | 1000 | 10000 | 1000000 | 10000000 |
+----------------------+------------------+-----------------+---------------+---------------+
| list_unique | 0.00201988220215 | 0.0215442180634 | 2.15613508224 | 21.0727329254 |
| numpy_unique | 0.00214719772339 | 0.0246329307556 | 2.45695090294 | 12.1858968735 |
| numpy_unique_ordered | 0.00900387763977 | 0.107609987259 | 7.05862998962 | 44.9925370216 |
+----------------------+------------------+-----------------+---------------+---------------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment