Skip to content

Instantly share code, notes, and snippets.

@benhall-7
Created September 29, 2020 02:40
Show Gist options
  • Save benhall-7/18e510f3222318c61ed43d4e87ecfef6 to your computer and use it in GitHub Desktop.
Save benhall-7/18e510f3222318c61ed43d4e87ecfef6 to your computer and use it in GitHub Desktop.
Duplicate name search between two lists with minimal allocation
import time
start_time = time.time()
f = open('names_1.txt', 'r')
names_1 = f.read().split("\n") # List containing 10000 names
f.close()
# takes at most log(n) space, which is less than a second array
names_1.sort()
duplicates = []
def bsearch(arr, val):
l = 0
r = len(arr) - 1
# check both endpoints before starting the loop
if arr[l] > val or arr[r] < val:
return False
if arr[l] == val or arr[r] == val:
return True
# if we've made it this far, neither endpoint is equal to the value,
# and the value may lie in between
while r - l >= 2:
mid = (l + r) // 2
# by assigning only one endpoint at a time we avoid having to check
# both each time we go through here. The else block is all we need
if arr[mid] > val:
r = mid
elif arr[mid] < val:
l = mid
else:
return True
return False
with open('names_2.txt', 'r') as f:
# iterate through the file instead of creating a second array
for line in f:
# don't allocate new strings. Use slice instead, and remove trailing newlines
if line[-1] == "\n":
name = line[0:-1]
else:
name = line[0:]
# now that we have the name, binary search the sorted list, which is O(b*log(a))
if bsearch(names_1, name):
duplicates.append(name)
end_time = time.time()
print (f"{len(duplicates)} duplicates:\n\n{', '.join(duplicates)}\n\n")
print (f"runtime: {end_time - start_time} seconds")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment