Last active
September 30, 2021 12:26
-
-
Save lrhache/36a9a5ea5fe7e1f121e3 to your computer and use it in GitHub Desktop.
Two method to remove duplicates from a python list while keeping order
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Benchmark multiple method of removing duplicates in a list | |
while keeping item's order in the list. | |
numpy.unique will likely be faster and more efficient | |
the more the list grows. | |
If a list is submitted to numpy.unique, it will be transformed and flattened | |
into a numpy.Array which it's data structure is more efficient than standard | |
lists since all elements have the same size in memory therefor requires less | |
access to the memory. Lists are stored as array of adresses pointing to | |
to the objects and requires more memory access (but more flexible). | |
Requirements: | |
pip install numpy prettytable | |
Usage: | |
python list_unique_benchmark.py <number of items> | |
""" | |
import math | |
import timeit | |
import itertools | |
import argparse | |
import numpy as np | |
from prettytable import PrettyTable | |
from random import shuffle | |
import multiprocessing as mp | |
def numpy_unique(seq): | |
"""Remove duplicate from a list | |
Required: | |
seq: A list containing all items | |
Returns: | |
A list with only unique values. Only the first occurence is kept. | |
""" | |
return np.unique(seq) | |
def numpy_unique_ordered(seq): | |
"""Remove duplicate from a list while keeping order with Numpy.unique | |
Required: | |
seq: A list containing all items | |
Returns: | |
A list with only unique values. Only the first occurence is kept. | |
""" | |
array_unique = np.unique(seq, return_index=True) | |
dstack = np.dstack(array_unique) | |
dstack.dtype = np.dtype([('v', dstack.dtype), ('i', dstack.dtype)]) | |
dstack.sort(order='i', axis=1) | |
return dstack.flatten()['v'].tolist() | |
def list_unique(seq): | |
"""Remove duplicate from list while keeping order | |
Required arguments: | |
seq: A list containing all objects | |
Returns: | |
A list with only unique values. Only the first occurence is kept. | |
""" | |
def filter_uniques(seq): | |
# We use set() since it's faster at lookups. | |
seen = set() | |
for x in seq: | |
if x in seen: | |
continue | |
seen.add(x) | |
yield x | |
return list(filter_uniques(seq)) | |
def random_string(): | |
""" Create a random string | |
Returns: | |
A random string lowercase | |
""" | |
length = 4 | |
chars = 'abcdefghijklmnopqrstuvwxyz' | |
char_list = list(chars) | |
shuffle(char_list) | |
string = ''.join(char_list) | |
return string[:length] | |
def get_testdata(nb_items): | |
"""Create a list of test data | |
Required arguments: | |
nb_items: The number of item to generate in list | |
Returns: | |
A list of random string which total item is equal to nb_items | |
""" | |
chunks_size = 1000 | |
nb_chunks = int(math.floor(nb_items/chunks_size)) | |
chunks = [chunks_size] * nb_chunks | |
chunks.append(nb_items - (nb_chunks*chunks_size)) | |
pool = mp.Pool() | |
result = pool.map_async(gen_data, chunks) | |
return list(itertools.chain(*result.get())) | |
def gen_data(n): | |
data = [] | |
for i in xrange(n): | |
data.append(random_string()) | |
return data | |
def benchmark(fn, *args): | |
def wrapper(): | |
fn(*args) | |
time = timeit.Timer(wrapper).timeit(10) | |
return time | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser(description='Benchmark list unique.') | |
parser.add_argument('nb_items', metavar='<nb of items>', type=int, | |
nargs='?', help='number of items to generate') | |
args = parser.parse_args() | |
nb_items = args.nb_items | |
if not nb_items: | |
items_count = [1000, 10000, 1000000, 10000000] | |
else: | |
items_count = [nb_items] | |
print "Generating %s items for test data" % nb_items | |
testdata = get_testdata(items_count[-1]) | |
test_fn = list_unique, numpy_unique, numpy_unique_ordered | |
table = PrettyTable([''] + items_count) | |
for fn in test_fn: | |
results = [benchmark(fn, testdata[:count]) for count in items_count] | |
table.add_row([fn.__name__] + results) | |
print table |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
+----------------------+------------------+-----------------+---------------+---------------+ | |
| | 1000 | 10000 | 1000000 | 10000000 | | |
+----------------------+------------------+-----------------+---------------+---------------+ | |
| list_unique | 0.00201988220215 | 0.0215442180634 | 2.15613508224 | 21.0727329254 | | |
| numpy_unique | 0.00214719772339 | 0.0246329307556 | 2.45695090294 | 12.1858968735 | | |
| numpy_unique_ordered | 0.00900387763977 | 0.107609987259 | 7.05862998962 | 44.9925370216 | | |
+----------------------+------------------+-----------------+---------------+---------------+ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment