Last active
October 12, 2018 09:23
-
-
Save edelooff/d21655bb289e7230e73f363fd433e026 to your computer and use it in GitHub Desktop.
Collection length checking before determining duplicate items
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import timeit | |
NO_DUPLICATES = ( | |
'one', 'two', 'three', 'four', 'five', 'six', | |
'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve') | |
WITH_DUPLICATES = ( | |
'one', 'two', 'three', 'four', 'five', 'six', | |
'seven', 'one', 'nine', 'two', 'eleven', 'three') | |
def check_length_first(collection): | |
if len(collection) != len(set(collection)): | |
return {x for x in collection if collection.count(x) > 1} | |
def immediate_duplicates(collection): | |
return {x for x in collection if collection.count(x) > 1} | |
def seen_set_duplicates(collection): | |
"""Builds a set of unique entries, returns items already contained. O(n)""" | |
seen = set() | |
return {x for x in collection if x in seen or seen.add(x)} | |
def call_latency(collection): | |
pass | |
if __name__ == '__main__': | |
print 'Just the function calls: {}'.format(timeit.timeit( | |
'call_latency(NO_DUPLICATES)', | |
setup='from __main__ import call_latency, NO_DUPLICATES')) | |
print '\nLength check first, no dupes: {}'.format(timeit.timeit( | |
'check_length_first(NO_DUPLICATES)', | |
setup='from __main__ import check_length_first, NO_DUPLICATES')) | |
print 'Length check first, dupes: {}'.format(timeit.timeit( | |
'check_length_first(WITH_DUPLICATES)', | |
setup='from __main__ import check_length_first, WITH_DUPLICATES')) | |
print '\nNo length check, no dupes: {}'.format(timeit.timeit( | |
'immediate_duplicates(NO_DUPLICATES)', | |
setup='from __main__ import immediate_duplicates, NO_DUPLICATES')) | |
print 'No Length check, dupes: {}'.format(timeit.timeit( | |
'immediate_duplicates(WITH_DUPLICATES)', | |
setup='from __main__ import immediate_duplicates, WITH_DUPLICATES')) | |
print '\nSeen-set based, no dupes: {}'.format(timeit.timeit( | |
'seen_set_duplicates(NO_DUPLICATES)', | |
setup='from __main__ import seen_set_duplicates, NO_DUPLICATES')) | |
print 'Seen-set based, dupes: {}'.format(timeit.timeit( | |
'seen_set_duplicates(WITH_DUPLICATES)', | |
setup='from __main__ import seen_set_duplicates, WITH_DUPLICATES')) | |
# Example timings from a Core i5-6200U | |
# Just the function calls: 0.0743689537048 | |
# Length check first, no dupes: 0.500941991806 | |
# Length check first, dupes: 3.38291621208 | |
# No length check, no dupes: 2.72766900063 | |
# No Length check, dupes: 2.82286787033 | |
# Seen-set based, no dupes: 1.60548996925 | |
# Seen-set based, dupes: 1.44544911385 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment