Skip to content

Instantly share code, notes, and snippets.

@okumura
Created November 24, 2019 22:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save okumura/fbe6a9051a3f20438a5202a40c687f2a to your computer and use it in GitHub Desktop.
Save okumura/fbe6a9051a3f20438a5202a40c687f2a to your computer and use it in GitHub Desktop.
wc -l
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import multiprocessing
import os
import sys
import time
import threading
def get_chunk_line_count(chunk):
    """Count the newline bytes inside one byte range of a file.

    Takes a single tuple so it can be passed directly to ``Pool.map``.
    (Tuple parameters written in the ``def`` itself are a syntax error in
    Python 3.)

    Args:
        chunk: ``(name, start, stop, blocksize)`` where ``name`` is the file
            path, ``start``/``stop`` are byte offsets (``stop`` exclusive)
            and ``blocksize`` is the maximum number of bytes per read.

    Returns:
        The number of ``\\n`` bytes found in ``[start, stop)``.
    """
    name, start, stop, blocksize = chunk
    remaining = stop - start
    count = 0
    # Binary mode: the offsets are byte positions, so the data must be read
    # and measured in bytes. In text mode read(n) returns characters, which
    # over-reads past ``stop`` on multibyte content.
    with open(name, 'rb') as f:
        f.seek(start)
        while remaining > 0:
            block = f.read(min(remaining, blocksize))
            if not block:
                # End of file reached before ``stop``.
                break
            count += block.count(b'\n')
            remaining -= len(block)
    return count
def get_file_offset_ranges(name, blocksize=65536, m=1):
    """Split a file into contiguous byte ranges for parallel processing.

    The chunk size is ``(file size // CPU count) * m``. The ranges cover the
    file from offset 0 to its size with no gaps or overlaps; a final shorter
    range is appended when the size is not a multiple of the chunk size.

    Args:
        name: Path of the file to split.
        blocksize: Read block size, copied unchanged into every range tuple.
        m: Multiplier applied to the per-CPU chunk size.

    Returns:
        A list of ``(name, start, stop, blocksize)`` tuples. When the
        computed chunk size is not positive (file smaller than the CPU
        count, empty file, ``m <= 0``) or exceeds the file size, a single
        range covering the whole file is returned.
    """
    fsize = os.stat(name).st_size
    chunksize = (fsize // multiprocessing.cpu_count()) * m
    # A zero chunk size would divide by zero below, and an oversized one
    # would produce no full chunks; one whole-file range handles both.
    if chunksize <= 0 or chunksize >= fsize:
        return [(name, 0, fsize, blocksize)]
    n = fsize // chunksize
    ranges = [
        (name, i * chunksize, (i + 1) * chunksize, blocksize)
        for i in range(n)
    ]
    covered = n * chunksize
    if covered < fsize:
        # Remainder that did not fill a whole chunk.
        ranges.append((name, covered, fsize, blocksize))
    return ranges
def wc_mp_pool(name, blocksize=65536):
    """Count the newlines in a file using a pool of worker processes.

    The file is split with ``get_file_offset_ranges`` and each range is
    counted by ``get_chunk_line_count`` in a pool sized to the number of
    ranges.

    Args:
        name: Path of the file to count.
        blocksize: Maximum number of bytes each worker reads at a time.

    Returns:
        The sum of the per-range newline counts.
    """
    ranges = get_file_offset_ranges(name, blocksize)
    if not ranges:
        # Pool(processes=0) raises ValueError; nothing to count anyway.
        return 0
    pool = multiprocessing.Pool(processes=len(ranges))
    try:
        pool_outputs = pool.map(get_chunk_line_count, ranges)
    finally:
        # Shut the workers down even when a worker raised.
        pool.close()
        pool.join()
    return sum(pool_outputs)
# Guard the entry point: with the "spawn" start method each worker process
# re-imports this module, and an unguarded call would run again in every
# worker.
if __name__ == '__main__':
    print(wc_mp_pool(sys.argv[1]))
@adrianodemarino
Copy link

Consider changing the signature of get_chunk_line_count() as follows, since tuple parameters in a function definition are no longer valid syntax in Python 3:

def get_chunk_line_count(ranges):
    """Count the newline bytes inside one byte range of a file.

    Args:
        ranges: A ``(name, start, stop, blocksize)`` tuple where ``name`` is
            the file path, ``start``/``stop`` are byte offsets (``stop``
            exclusive) and ``blocksize`` is the maximum number of bytes per
            read.

    Returns:
        The number of ``\\n`` bytes found in ``[start, stop)``.
    """
    name, start, stop, blocksize = ranges
    left = stop - start
    total = 0

    # Binary mode: the offsets are byte positions, so the data must be read
    # and measured in bytes. In text mode read(n) returns characters, which
    # over-reads past ``stop`` on multibyte content.
    with open(name, 'rb') as f:
        f.seek(start)
        while left > 0:
            b = f.read(min(left, blocksize))
            if not b:
                # End of file reached before ``stop``.
                break
            total += b.count(b'\n')
            left -= len(b)
    return total

Thank you for sharing.

Adriano

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment