lrhache/list_unique_benchmark.py

## list_unique_benchmark.py
"""Benchmark multiple method of removing duplicates in a list
    while keeping item's order in the list.

    numpy.unique will likely be faster and more efficient
    the more the list grows.
    If a list is submitted to numpy.unique, it will be transformed and flattened
    into a numpy.Array which it's data structure is more efficient than standard
    lists since all elements have the same size in memory therefor requires less
    access to the memory. Lists are stored as array of adresses pointing to
    to the objects and requires more memory access (but more flexible).

    Requirements:
        pip install numpy prettytable

    Usage:
        python list_unique_benchmark.py <number of items>
"""
import math
import timeit
import itertools
import argparse
import numpy as np
from prettytable import PrettyTable
from random import shuffle
import multiprocessing as mp


def numpy_unique(seq):
    """Remove duplicate from a list

    Required:
        seq: A list containing all items

    Returns:
        A list with only unique values. Only the first occurence is kept.
    """
    return np.unique(seq)


def numpy_unique_ordered(seq):
    """Remove duplicate from a list while keeping order with Numpy.unique

    Required:
        seq: A list containing all items

    Returns:
        A list with only unique values. Only the first occurence is kept.
    """
    array_unique = np.unique(seq, return_index=True)
    dstack = np.dstack(array_unique)
    dstack.dtype = np.dtype([('v', dstack.dtype), ('i', dstack.dtype)])
    dstack.sort(order='i', axis=1)
    return dstack.flatten()['v'].tolist()


def list_unique(seq):
    """Remove duplicate from list while keeping order

    Required arguments:
        seq: A list containing all objects

    Returns:
        A list with only unique values. Only the first occurence is kept.
    """
    def filter_uniques(seq):
        # We use set() since it's faster at lookups.
        seen = set()
        for x in seq:
            if x in seen:
                continue
            seen.add(x)
            yield x

    return list(filter_uniques(seq))


def random_string():
    """ Create a random string

    Returns:
        A random string lowercase
    """
    length = 4
    chars = 'abcdefghijklmnopqrstuvwxyz'
    char_list = list(chars)
    shuffle(char_list)
    string = ''.join(char_list)
    return string[:length]


def get_testdata(nb_items):
    """Create a list of test data

    Required arguments:
        nb_items: The number of item to generate in list

    Returns:
        A list of random string which total item is equal to nb_items
    """
    chunks_size = 1000
    nb_chunks = int(math.floor(nb_items/chunks_size))
    chunks = [chunks_size] * nb_chunks
    chunks.append(nb_items - (nb_chunks*chunks_size))

    pool = mp.Pool()
    result = pool.map_async(gen_data, chunks)
    return list(itertools.chain(*result.get()))


def gen_data(n):
    data = []
    for i in xrange(n):
        data.append(random_string())
    return data


def benchmark(fn, *args):
    def wrapper():
        fn(*args)

    time = timeit.Timer(wrapper).timeit(10)
    return time

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Benchmark list unique.')
    parser.add_argument('nb_items', metavar='<nb of items>', type=int,
                        nargs='?', help='number of items to generate')
    args = parser.parse_args()
    nb_items = args.nb_items

    if not nb_items:
        items_count = [1000, 10000, 1000000, 10000000]
    else:
        items_count = [nb_items]

    print "Generating %s items for test data" % nb_items
    testdata = get_testdata(items_count[-1])

    test_fn = list_unique, numpy_unique, numpy_unique_ordered
    table = PrettyTable([''] + items_count)

    for fn in test_fn:
        results = [benchmark(fn, testdata[:count]) for count in items_count]
        table.add_row([fn.__name__] + results)

    print table

## results.txt
+----------------------+------------------+-----------------+---------------+---------------+
|                      |       1000       |      10000      |    1000000    |    10000000   |
+----------------------+------------------+-----------------+---------------+---------------+
|     list_unique      | 0.00201988220215 | 0.0215442180634 | 2.15613508224 | 21.0727329254 |
|     numpy_unique     | 0.00214719772339 | 0.0246329307556 | 2.45695090294 | 12.1858968735 |
| numpy_unique_ordered | 0.00900387763977 |  0.107609987259 | 7.05862998962 | 44.9925370216 |
+----------------------+------------------+-----------------+---------------+---------------+
	"""Benchmark multiple method of removing duplicates in a list
	while keeping item's order in the list.

	numpy.unique will likely be faster and more efficient
	the more the list grows.
	If a list is submitted to numpy.unique, it will be transformed and flattened
	into a numpy.Array which it's data structure is more efficient than standard
	lists since all elements have the same size in memory therefor requires less
	access to the memory. Lists are stored as array of adresses pointing to
	to the objects and requires more memory access (but more flexible).

	Requirements:
	pip install numpy prettytable

	Usage:
	python list_unique_benchmark.py <number of items>
	"""
	import math
	import timeit
	import itertools
	import argparse
	import numpy as np
	from prettytable import PrettyTable
	from random import shuffle
	import multiprocessing as mp


	def numpy_unique(seq):
	"""Remove duplicate from a list

	Required:
	seq: A list containing all items

	Returns:
	A list with only unique values. Only the first occurence is kept.
	"""
	return np.unique(seq)


	def numpy_unique_ordered(seq):
	"""Remove duplicate from a list while keeping order with Numpy.unique

	Required:
	seq: A list containing all items

	Returns:
	A list with only unique values. Only the first occurence is kept.
	"""
	array_unique = np.unique(seq, return_index=True)
	dstack = np.dstack(array_unique)
	dstack.dtype = np.dtype([('v', dstack.dtype), ('i', dstack.dtype)])
	dstack.sort(order='i', axis=1)
	return dstack.flatten()['v'].tolist()



	def list_unique(seq):
	"""Remove duplicate from list while keeping order

	Required arguments:
	seq: A list containing all objects

	Returns:
	A list with only unique values. Only the first occurence is kept.
	"""
	def filter_uniques(seq):
	# We use set() since it's faster at lookups.
	seen = set()
	for x in seq:
	if x in seen:
	continue
	seen.add(x)
	yield x

	return list(filter_uniques(seq))


	def random_string():
	""" Create a random string

	Returns:
	A random string lowercase
	"""
	length = 4
	chars = 'abcdefghijklmnopqrstuvwxyz'
	char_list = list(chars)
	shuffle(char_list)
	string = ''.join(char_list)
	return string[:length]


	def get_testdata(nb_items):
	"""Create a list of test data

	Required arguments:
	nb_items: The number of item to generate in list

	Returns:
	A list of random string which total item is equal to nb_items
	"""
	chunks_size = 1000
	nb_chunks = int(math.floor(nb_items/chunks_size))
	chunks = [chunks_size] * nb_chunks
	chunks.append(nb_items - (nb_chunks*chunks_size))

	pool = mp.Pool()
	result = pool.map_async(gen_data, chunks)
	return list(itertools.chain(*result.get()))


	def gen_data(n):
	data = []
	for i in xrange(n):
	data.append(random_string())
	return data


	def benchmark(fn, *args):
	def wrapper():
	fn(*args)

	time = timeit.Timer(wrapper).timeit(10)
	return time

	if __name__ == '__main__':
	parser = argparse.ArgumentParser(description='Benchmark list unique.')
	parser.add_argument('nb_items', metavar='<nb of items>', type=int,
	nargs='?', help='number of items to generate')
	args = parser.parse_args()
	nb_items = args.nb_items

	if not nb_items:
	items_count = [1000, 10000, 1000000, 10000000]
	else:
	items_count = [nb_items]

	print "Generating %s items for test data" % nb_items
	testdata = get_testdata(items_count[-1])

	test_fn = list_unique, numpy_unique, numpy_unique_ordered
	table = PrettyTable([''] + items_count)

	for fn in test_fn:
	results = [benchmark(fn, testdata[:count]) for count in items_count]
	table.add_row([fn.__name__] + results)

	print table
	+----------------------+------------------+-----------------+---------------+---------------+
	\| \| 1000 \| 10000 \| 1000000 \| 10000000 \|
	+----------------------+------------------+-----------------+---------------+---------------+
	\| list_unique \| 0.00201988220215 \| 0.0215442180634 \| 2.15613508224 \| 21.0727329254 \|
	\| numpy_unique \| 0.00214719772339 \| 0.0246329307556 \| 2.45695090294 \| 12.1858968735 \|
	\| numpy_unique_ordered \| 0.00900387763977 \| 0.107609987259 \| 7.05862998962 \| 44.9925370216 \|
	+----------------------+------------------+-----------------+---------------+---------------+