Created
July 8, 2018 20:31
-
-
Save btaba/f9855f942b46f93f641a04c09ab64266 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import time | |
import shutil | |
import random | |
import heapq | |
def remove_file(f): | |
if os.path.exists(f): | |
os.remove(f) | |
return f | |
def create_file(num=10e6): | |
with open(remove_file('bigfile.txt'), 'a') as f: | |
for _ in range(int(num)): | |
f.write(str(random.random()) + '\n') | |
return 'bigfile.txt' | |
def stream_file(f): | |
with open(f, 'r') as f: | |
for i in f: | |
yield float(i.strip()) | |
def get_next(s): | |
try: | |
return next(s) | |
except StopIteration: | |
return None | |
def mergestreams(list_of_streams): | |
# merge a list of streams using a heap | |
# Nlog(k) run-time | |
pq = [] | |
for idx, s in enumerate(list_of_streams): | |
el = get_next(s) | |
if el: | |
heapq.heappush(pq, (el, idx)) | |
while len(pq) > 0: | |
el, idx = heapq.heappop(pq) | |
yield el | |
el = get_next(list_of_streams[idx]) | |
if el: | |
heapq.heappush(pq, (el, idx)) | |
def mergesort(stream, chunk_size=100000): | |
# chunk the stream into files, sorted using heap-sort | |
file_count = 0 | |
pq = [] | |
for i, el in enumerate(stream): | |
heapq.heappush(pq, el) | |
if i % chunk_size == 0: | |
if i > 0: | |
while len(pq) > 0: | |
f.write(str(heapq.heappop(pq)) + '\n') | |
f.close() | |
f = open(remove_file('chunk{}.txt'.format(file_count)), 'a') | |
file_count += 1 | |
while len(pq) > 0: | |
f.write(str(heapq.heappop(pq)) + '\n') | |
f.close() | |
files = ['chunk{}.txt'.format(i) for i in range(file_count)] | |
# merge all chunk streams by using a heap | |
list_of_streams = [stream_file(f) for f in files] | |
with open(remove_file('last_file.txt'), 'a') as f1: | |
for i in mergestreams(list_of_streams): | |
f1.write(str(i) + '\n') | |
return stream_file('last_file.txt') | |
def inmem_merge(a, l, m, r): | |
i, j = l, m | |
res = [] | |
while i < m and j < r: | |
if a[i] < a[j]: | |
res.append(a[i]) | |
i += 1 | |
else: | |
res.append(a[j]) | |
j += 1 | |
while i < m: | |
res.append(a[i]) | |
i += 1 | |
while j < r: | |
res.append(a[j]) | |
j += 1 | |
a[l:r] = res | |
def inmem_mergesort(a, l, r): | |
if (r - 1) <= l: | |
return | |
m = (r - l) // 2 + l | |
inmem_mergesort(a, l, m) | |
inmem_mergesort(a, m, r) | |
inmem_merge(a, l, m, r) | |
if __name__ == '__main__': | |
s = list(stream_file(create_file(50e6))) | |
tstart = time.time() | |
res = mergesort(s, chunk_size=int(1e6)) | |
print('File based mergesort time (min): {}'.format((time.time() - tstart) / 60)) | |
last = 0 | |
for i in res: | |
assert last <= i | |
last = i | |
print('File based merge sort passed test') | |
# s = list(stream_file(create_file(50e6))) | |
random.shuffle(s) | |
tstart = time.time() | |
inmem_mergesort(s, 0, len(s)) | |
print('Mergesort time (min): {}'.format((time.time() - tstart) / 60)) | |
last = 0 | |
for i in s: | |
assert last <= i | |
last = i | |
print('Merge sort passed test') | |
random.shuffle(s) | |
tstart = time.time() | |
sorted(s) | |
print('Sorted time (min): {}'.format((time.time() - tstart) / 60)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment