public
Created

  • Download Gist
saveTopK.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
import heapq
 
def _to_seconds(stamp):
    """Convert a dot-separated 'HH.MM.SS' string (e.g. '23.04.05') to seconds."""
    parts = [int(x) for x in stamp.split('.')]
    return parts[0] * 60 * 60 + parts[1] * 60 + parts[2]


def saveTopK(k, log_fn, output_fn):
    """Write the k log entries with the longest duration to ``output_fn``.

    Each non-blank line of ``log_fn`` is split on whitespace; its last two
    fields are read as the start and end time in 'HH.MM.SS' form, and the
    entry's duration is ``end - start`` in seconds. Only k entries are held
    in memory at any time, so the log is streamed rather than loaded whole.

    Args:
        k: Maximum number of entries to keep. If k <= 0 the log is not read
            and an empty output file is written.
        log_fn: Path of the log file to read.
        output_fn: Path of the file to write (overwritten). Entries are
            written one per line, stripped of surrounding whitespace,
            longest duration first; equal durations are ordered by entry
            text, descending.

    Raises:
        IndexError: A non-blank line has fewer than two fields, or a time
            field has fewer than three dot-separated components.
        ValueError: A time component is not an integer.
    """
    min_heap = []

    # Guard k <= 0: with no capacity the heap stays empty, and peeking at
    # min_heap[0] would raise IndexError.
    if k > 0:
        with open(log_fn, 'r') as log:
            for line in log:
                entry = line.strip()
                if not entry:
                    # Blank lines (e.g. a trailing empty line) carry no times.
                    continue
                values = entry.split()  # split by whitespace

                # start and end time are the last two values in an entry
                start_seconds = _to_seconds(values[-2])
                end_seconds = _to_seconds(values[-1])
                duration = end_seconds - start_seconds

                # Tuples compare position by position, so the heap is
                # ordered by duration first and entry text second.
                item = (duration, entry)

                if len(min_heap) < k:
                    heapq.heappush(min_heap, item)
                elif duration > min_heap[0][0]:
                    # Strictly longer than the shortest kept entry: evict it.
                    heapq.heapreplace(min_heap, item)

    with open(output_fn, 'w') as output:
        # The heap never holds more than k items, so a full descending sort
        # yields the top-k in order.
        for item in sorted(min_heap, reverse=True):
            output.write(item[1] + '\n')

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.