Skip to content

Instantly share code, notes, and snippets.

@myaut
Created April 9, 2015 15:12
Show Gist options
  • Save myaut/d52380c56f0a1624d762 to your computer and use it in GitHub Desktop.
Save myaut/d52380c56f0a1624d762 to your computer and use it in GitHub Desktop.
Multiprocess grep in Python
from multiprocessing import Pool
from itertools import chain
import re
import sys
# Number of worker processes in the pool.
PROC_COUNT = 8
# Number of records handed to a worker in a single task.
LINES_PER_PROC = 40

# The script needs exactly two arguments: the pattern and the input file.
if len(sys.argv) != 3:
    sys.stderr.write("usage: mplog.py RE FILE\n")
    sys.exit(1)

# Compiled at module level so that match() can reach it as a global.
regex = re.compile(sys.argv[1])
def match(lines):
    """Test every entry of ``lines`` against the module-level ``regex``.

    Returns a list of booleans in input order. An entry is True when
    ``regex.match`` succeeds for it, i.e. the pattern matches at the
    start of the entry.
    """
    return [regex.match(line) is not None for line in lines]
def chunks(l, n):
    """Yield consecutive slices of ``l`` holding at most ``n`` items each.

    ``l`` must support ``len()`` and slicing; ``n`` is the (positive)
    chunk size.  The last slice is shorter when ``len(l)`` is not a
    multiple of ``n``; an empty ``l`` yields nothing.
    """
    # range() rather than the Python 2-only xrange(): it behaves the same
    # here and keeps the helper usable on Python 3 as well.
    for start in range(0, len(l), n):
        yield l[start:start + n]
def read_records(line_iter):
    """Yield records made of two consecutive lines joined together.

    A trailing unpaired line (odd line count) is yielded on its own
    instead of raising StopIteration out of the loop.
    """
    line_iter = iter(line_iter)
    for first in line_iter:
        # next() with a default: an odd last line must not abort the scan.
        yield first + next(line_iter, "")


def batches(records, size):
    """Group ``records`` into lists of ``size`` items; the last may be shorter."""
    batch = []
    for record in records:
        batch.append(record)
        if len(batch) >= size:
            yield batch
            batch = []
    # Flush the tail so the end of the input is searched too.
    if batch:
        yield batch


if __name__ == '__main__':
    # Read the file as two-line records, test them against the regex in
    # parallel, and write the matching records to stdout.
    pool = Pool(PROC_COUNT)
    try:
        with open(sys.argv[2]) as f:
            processed = 0
            last_mb = 0
            for batch in batches(read_records(f), LINES_PER_PROC * PROC_COUNT):
                # pool.map keeps task order, so the flattened flags line up
                # one-to-one with the records of the batch.
                flags = chain.from_iterable(
                    pool.map(match, chunks(batch, LINES_PER_PROC)))
                sys.stdout.writelines(
                    record for record, hit in zip(batch, flags) if hit)

                # Count consumed data ourselves: f.tell() is unreliable (or
                # disabled) while the file is being iterated.  The figure is
                # approximate when the file is decoded as text.
                processed += sum(len(record) for record in batch)
                mb = processed // (1024 * 1024)
                if (mb - last_mb) > 20:
                    sys.stderr.write("Processed {} MB\n".format(mb))
                    last_mb = mb
    finally:
        # Always release the worker processes, even on error.
        pool.close()
        pool.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment