Skip to content

Instantly share code, notes, and snippets.

@btaba
Created July 8, 2018 20:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save btaba/f9855f942b46f93f641a04c09ab64266 to your computer and use it in GitHub Desktop.
Save btaba/f9855f942b46f93f641a04c09ab64266 to your computer and use it in GitHub Desktop.
import os
import time
import shutil
import random
import heapq
def remove_file(f):
if os.path.exists(f):
os.remove(f)
return f
def create_file(num=10e6):
with open(remove_file('bigfile.txt'), 'a') as f:
for _ in range(int(num)):
f.write(str(random.random()) + '\n')
return 'bigfile.txt'
def stream_file(f):
with open(f, 'r') as f:
for i in f:
yield float(i.strip())
def get_next(s):
try:
return next(s)
except StopIteration:
return None
def mergestreams(list_of_streams):
# merge a list of streams using a heap
# Nlog(k) run-time
pq = []
for idx, s in enumerate(list_of_streams):
el = get_next(s)
if el:
heapq.heappush(pq, (el, idx))
while len(pq) > 0:
el, idx = heapq.heappop(pq)
yield el
el = get_next(list_of_streams[idx])
if el:
heapq.heappush(pq, (el, idx))
def mergesort(stream, chunk_size=100000):
# chunk the stream into files, sorted using heap-sort
file_count = 0
pq = []
for i, el in enumerate(stream):
heapq.heappush(pq, el)
if i % chunk_size == 0:
if i > 0:
while len(pq) > 0:
f.write(str(heapq.heappop(pq)) + '\n')
f.close()
f = open(remove_file('chunk{}.txt'.format(file_count)), 'a')
file_count += 1
while len(pq) > 0:
f.write(str(heapq.heappop(pq)) + '\n')
f.close()
files = ['chunk{}.txt'.format(i) for i in range(file_count)]
# merge all chunk streams by using a heap
list_of_streams = [stream_file(f) for f in files]
with open(remove_file('last_file.txt'), 'a') as f1:
for i in mergestreams(list_of_streams):
f1.write(str(i) + '\n')
return stream_file('last_file.txt')
def inmem_merge(a, l, m, r):
i, j = l, m
res = []
while i < m and j < r:
if a[i] < a[j]:
res.append(a[i])
i += 1
else:
res.append(a[j])
j += 1
while i < m:
res.append(a[i])
i += 1
while j < r:
res.append(a[j])
j += 1
a[l:r] = res
def inmem_mergesort(a, l, r):
if (r - 1) <= l:
return
m = (r - l) // 2 + l
inmem_mergesort(a, l, m)
inmem_mergesort(a, m, r)
inmem_merge(a, l, m, r)
if __name__ == '__main__':
s = list(stream_file(create_file(50e6)))
tstart = time.time()
res = mergesort(s, chunk_size=int(1e6))
print('File based mergesort time (min): {}'.format((time.time() - tstart) / 60))
last = 0
for i in res:
assert last <= i
last = i
print('File based merge sort passed test')
# s = list(stream_file(create_file(50e6)))
random.shuffle(s)
tstart = time.time()
inmem_mergesort(s, 0, len(s))
print('Mergesort time (min): {}'.format((time.time() - tstart) / 60))
last = 0
for i in s:
assert last <= i
last = i
print('Merge sort passed test')
random.shuffle(s)
tstart = time.time()
sorted(s)
print('Sorted time (min): {}'.format((time.time() - tstart) / 60))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment