Skip to content

Instantly share code, notes, and snippets.

@hobbsh
Created January 18, 2019 22:07
Show Gist options
  • Save hobbsh/797306d8df44d6df5cee1894acc07b6b to your computer and use it in GitHub Desktop.
Save hobbsh/797306d8df44d6df5cee1894acc07b6b to your computer and use it in GitHub Desktop.
Sort arbitrary sized file filled with random integers by N biggest
#!/usr/bin/env python
import random
import sys
import threading, Queue
from heapq import nlargest
import os
# job queue
queue = Queue.Queue()
# result queue
result = []
FILE = 'nums.txt'
# From: http://effbot.org/zone/wide-finder.htm
class Worker(threading.Thread):
def run(self):
while 1:
args = queue.get()
if args is None:
break
result.append(process(*args))
queue.task_done()
# Sort chunk by largest number and return it
def process(file, chunk):
f = open(file, "rb")
f.seek(chunk[0])
chunk_list = f.read(chunk[1]).split('\n')
largest = max(chunk_list)
return largest
# Chunk input file into 1M chunks - increase size for bigger files
def getchunks(fname, size=1024*1024):
f = open(fname)
while 1:
start = f.tell()
f.seek(size, 1)
s = f.readline()
yield start, f.tell() - start
if not s:
break
#Seed a new nums.txt
def seed(nums):
with open(FILE, 'w+') as f:
for i in range(0,nums):
line = str(random.randint(1, 99999999999999))
f.write(line+'\n')
def sort():
try:
num_biggest = int(sys.argv[1])
except ValueError:
print('Biggest number value must be a valid integer: i.e. python nums.py 100')
sys.exit(1)
# Create 4 worker threads
for i in range(4):
w = Worker()
w.setDaemon(1)
w.start()
# Generate chunks of file and push them to queue for workers to process
for chunk in getchunks(FILE):
queue.put((FILE, chunk))
# Start processing
queue.join()
#Sort result list by N biggest numbers
biggest_nums = nlargest(num_biggest, result)
for num in biggest_nums:
print(num)
if __name__ == '__main__':
#seed(1000000000) #seed 100 million ints
sort()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment