Created
April 10, 2013 04:46
-
-
Save gcorreaq/5351871 to your computer and use it in GitHub Desktop.
Script for taking random lines from a file (with repetition) and putting them in separate files, one per worker process.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import multiprocessing
import os
import random

import clint
def count_lines(path):
    """Return the number of lines in the text file at ``path``.

    The file is read line by line, so it is never held in memory as a whole.
    An empty file yields 0.
    """
    with open(path, 'r') as source:
        # Each iteration of the file object yields one line; tally them.
        return sum(1 for _ in source)
def random_line(file_handler):
    """Pick one item from the iterator ``file_handler`` and return it.

    The whole iterator is consumed in a single pass.  The first item is the
    initial pick; the k-th item (counting from 1) then replaces the current
    pick when ``random.randrange(k)`` returns 0.

    Raises:
        StopIteration: if ``file_handler`` yields nothing at all.
    """
    chosen = next(file_handler)
    seen = 1
    for candidate in file_handler:
        seen += 1
        # randrange(seen) is 0 for exactly one of its `seen` possible values.
        if random.randrange(seen) == 0:
            chosen = candidate
    return chosen
def perform(input_file, output_file, line_qty):
    """Write ``line_qty`` randomly chosen lines of ``input_file`` to ``output_file``.

    Every line is drawn independently with ``random_line``, so the same input
    line may be written more than once.  The input file is rewound after each
    draw, which means each draw reads the input from start to end.

    Args:
        input_file: Path of the text file to sample from.
        output_file: Path of the file that receives the sample; it is opened
            in ``'w'`` mode, so existing content is overwritten.
        line_qty: Number of lines to write.  Zero or a negative value writes
            nothing.
    """
    with open(input_file, 'r') as file_r, open(output_file, 'w') as file_w:
        # A plain counter keeps this working on both Python 2 and Python 3
        # (no xrange) without building a list of line_qty integers.
        remaining = line_qty
        while remaining > 0:
            line = random_line(file_r)
            # The final line of the input may lack a trailing newline; add one
            # so it is not glued to the next sampled line in the output.
            if not line.endswith('\n'):
                line += '\n'
            file_w.write(line)
            # Rewind so the next draw considers every line again.
            file_r.seek(0)
            remaining -= 1
def main():
    """Sample random lines from a file in parallel, one output file per worker.

    Command-line arguments (read through ``clint.args``):
        0: path of the input file.
        1: fraction of the input's line count to sample, parsed with
           ``float`` and multiplied by the line count (e.g. ``0.1``).

    One ``perform`` task is submitted per CPU reported by
    ``multiprocessing.cpu_count()``.  Task number ``N`` writes to
    ``sample_N_<input basename>``, placed in the same directory as the input
    file.

    Raises:
        Exception: whatever a worker task raised is re-raised here once all
            workers have stopped, instead of being silently discarded.
    """
    filename = clint.args.get(0)
    percentage = float(clint.args.get(1))

    parts = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(maxtasksperchild=1)

    total_lines = int(count_lines(filename) * percentage)
    # Integer division: any remainder of total_lines is not sampled.
    lines_per_worker = total_lines // parts

    # Prefix only the file name, not the whole path; otherwise an input such
    # as "data/in.txt" would produce "sample_0_data/in.txt", which points
    # into a directory that normally does not exist.
    directory, basename = os.path.split(filename)

    pending = []
    for part in range(parts):
        output_path = os.path.join(
            directory, "sample_{0}_{1}".format(part, basename))
        pending.append(pool.apply_async(
            perform, (filename, output_path, lines_per_worker)))

    pool.close()
    print('Waiting the workers to finish')
    pool.join()

    # apply_async stores a task's exception in its result object; get()
    # re-raises it so a failed worker does not go unnoticed.
    for result in pending:
        result.get()

    print('Process finished!')
# Run the sampler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment