Skip to content

Instantly share code, notes, and snippets.

@gcorreaq
Created April 10, 2013 04:46
Show Gist options
  • Save gcorreaq/5351871 to your computer and use it in GitHub Desktop.
Save gcorreaq/5351871 to your computer and use it in GitHub Desktop.
Script for taking random lines from a file (with repetition) and putting them in separate files, one per process worker.
import multiprocessing
import random
import clint
def count_lines(path):
    """Return the number of lines in the file at *path* (0 for an empty file)."""
    with open(path, 'r') as file_r:
        # Summing over the file iterator counts lines without keeping any of
        # them in memory; replaces the old enumerate-and-pass loop that left
        # an unused `line` variable behind.
        return sum(1 for _ in file_r)
def random_line(file_handler):
line = next(file_handler)
for line_number, other_line in enumerate(file_handler):
if random.randrange(line_number + 2):
continue
line = other_line
return line
def perform(input_file, output_file, line_qty):
    """Write *line_qty* lines sampled (with repetition) from *input_file*
    into *output_file*.

    Each draw scans the whole input and then rewinds, so the cost is
    O(line_qty * file_size); acceptable for the one-shot worker jobs this
    script fans out. NOTE(review): an empty input file makes random_line
    raise StopIteration on the first draw — confirm callers never pass one.
    """
    with open(input_file, 'r') as file_r:
        with open(output_file, 'w') as file_w:
            # range (not the Python-2-only xrange) keeps this runnable under
            # both Python 2 and Python 3.
            for _ in range(line_qty):
                file_w.write(random_line(file_r))
                # Rewind so the next draw sees the full file again —
                # this is what makes the sampling "with repetition".
                file_r.seek(0)
def main():
    """Fan a random line sample of a file out to one output file per CPU core.

    CLI (via clint positional args): <filename> <percentage>, where
    <percentage> is the fraction (e.g. 0.25) of the input's lines to
    sample in total across all workers.
    """
    filename = clint.args.get(0)
    percentage = float(clint.args.get(1))
    parts = multiprocessing.cpu_count()
    # maxtasksperchild=1: each worker process is recycled after a single
    # task, so every job starts in a fresh process.
    pool = multiprocessing.Pool(maxtasksperchild=1)
    total_lines = int(count_lines(filename) * percentage)
    # Floor division behaves the same on Python 2 and 3 (plain `/` yields a
    # float on Python 3). Up to parts-1 remainder lines are not assigned to
    # any worker — TODO confirm that loss is intended.
    lines_per_worker = total_lines // parts
    for part in range(parts):
        # NOTE(review): the AsyncResult is discarded, so worker exceptions
        # are silently lost; keep the results and call .get() after join()
        # if failures must surface.
        pool.apply_async(
            perform,
            (filename,
             "sample_{0}_{1}".format(part, filename),
             lines_per_worker))
    pool.close()
    # print() with a single argument emits identical output on Python 2 and
    # Python 3; the old `print 'x'` statement is a SyntaxError on Python 3.
    print('Waiting the workers to finish')
    pool.join()
    print('Process finished!')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment