Skip to content

Instantly share code, notes, and snippets.

@edelooff
Last active October 12, 2018 09:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edelooff/d21655bb289e7230e73f363fd433e026 to your computer and use it in GitHub Desktop.
Save edelooff/d21655bb289e7230e73f363fd433e026 to your computer and use it in GitHub Desktop.
Collection length checking before determining duplicate items
import timeit
NO_DUPLICATES = (
'one', 'two', 'three', 'four', 'five', 'six',
'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve')
WITH_DUPLICATES = (
'one', 'two', 'three', 'four', 'five', 'six',
'seven', 'one', 'nine', 'two', 'eleven', 'three')
def check_length_first(collection):
if len(collection) != len(set(collection)):
return {x for x in collection if collection.count(x) > 1}
def immediate_duplicates(collection):
return {x for x in collection if collection.count(x) > 1}
def seen_set_duplicates(collection):
"""Builds a set of unique entries, returns items already contained. O(n)"""
seen = set()
return {x for x in collection if x in seen or seen.add(x)}
def call_latency(collection):
pass
if __name__ == '__main__':
print 'Just the function calls: {}'.format(timeit.timeit(
'call_latency(NO_DUPLICATES)',
setup='from __main__ import call_latency, NO_DUPLICATES'))
print '\nLength check first, no dupes: {}'.format(timeit.timeit(
'check_length_first(NO_DUPLICATES)',
setup='from __main__ import check_length_first, NO_DUPLICATES'))
print 'Length check first, dupes: {}'.format(timeit.timeit(
'check_length_first(WITH_DUPLICATES)',
setup='from __main__ import check_length_first, WITH_DUPLICATES'))
print '\nNo length check, no dupes: {}'.format(timeit.timeit(
'immediate_duplicates(NO_DUPLICATES)',
setup='from __main__ import immediate_duplicates, NO_DUPLICATES'))
print 'No Length check, dupes: {}'.format(timeit.timeit(
'immediate_duplicates(WITH_DUPLICATES)',
setup='from __main__ import immediate_duplicates, WITH_DUPLICATES'))
print '\nSeen-set based, no dupes: {}'.format(timeit.timeit(
'seen_set_duplicates(NO_DUPLICATES)',
setup='from __main__ import seen_set_duplicates, NO_DUPLICATES'))
print 'Seen-set based, dupes: {}'.format(timeit.timeit(
'seen_set_duplicates(WITH_DUPLICATES)',
setup='from __main__ import seen_set_duplicates, WITH_DUPLICATES'))
# Example timings from a Core i5-6200U
# Just the function calls: 0.0743689537048
# Length check first, no dupes: 0.500941991806
# Length check first, dupes: 3.38291621208
# No length check, no dupes: 2.72766900063
# No Length check, dupes: 2.82286787033
# Seen-set based, no dupes: 1.60548996925
# Seen-set based, dupes: 1.44544911385
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment