"""Parallel word search: a grep-like scan of files for a word, using worker processes."""

import argparse
import multiprocessing
import os
import queue

# Number of bytes read per chunk when scanning a file.
BLOCK_SIZE = 8000

def parse_options():
    parser = argparse.ArgumentParser(description="find (grep for) a word in the given files")
    parser.add_argument("--count", type=int, default=3, dest="count",
                        help="number of worker processes to use")
    # Note: --debug is parsed but not consumed anywhere below.
    parser.add_argument("--debug", dest="debug", action="store_true",
                        help="include worker number in output")
    parser.add_argument("--recurse", dest="recurse", action="store_true",
                        help="recursively descend into directory arguments")
    parser.add_argument("word", type=str, help="the word to search for")
    parser.add_argument("files", type=str, help="a file/directory to search in",
                        metavar="file_dir", nargs="+")
    args = parser.parse_args()
    return args, args.word, args.files

def get_files(args, recurse):
    files = []
    for item in args:
        if os.path.isfile(item):
            files.append(item)
        elif os.path.isdir(item) and recurse:
            for root, _dirs, sub_files in os.walk(item):
                files.extend(os.path.join(root, file) for file in sub_files)
        elif os.path.isdir(item):
            for entry in os.listdir(item):
                # os.listdir returns bare names, so join with the directory
                # before testing whether the entry is a file.
                path = os.path.join(item, entry)
                if os.path.isfile(path):
                    files.append(path)
    return files
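
# Example (hypothetical layout): given a file notes.txt and a directory src/
# containing a.py, get_files(["notes.txt", "src"], recurse=False) returns
# ["notes.txt", "src/a.py"]; with recurse=True the walk also descends into
# any subdirectories of src/.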

def search(word, work_queue, results_queue):
    """Worker: pull filenames off work_queue, report matching files on results_queue."""
    while True:
        try:
            filename = work_queue.get(block=False)
        except queue.Empty:
            # All files were queued before the workers started, so an
            # empty queue means there is no more work.
            break
        previous = ""
        with open(filename, "rb") as f:
            while True:
                raw = f.read(BLOCK_SIZE)
                if not raw:
                    break
                # "ignore" drops invalid bytes, including a multibyte
                # character split at a block edge.
                current = raw.decode("utf-8", "ignore")
                # Check this block, plus a small window straddling the block
                # boundary so a word split across two reads is still found.
                if (word in current or
                        word in previous[-len(word):] + current[:len(word)]):
                    results_queue.put(filename)
                    break
                if len(raw) != BLOCK_SIZE:
                    break  # short read: reached end of file
                previous = current
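
# Boundary example: with word = "magic" (length 5), a block ending "...xxxma"
# followed by a block starting "gicyy..." gives the window
# previous[-5:] + current[:5] == "xxxma" + "gicyy" == "xxxmagicyy",
# which contains "magic", so the split occurrence is still detected.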

if __name__ == "__main__":
    opts, word, args = parse_options()
    filelist = get_files(args, opts.recurse)
    work_queue = multiprocessing.Queue()
    results_queue = multiprocessing.Queue()
    # Load every file onto the queue before the workers start, so a
    # non-blocking get() that raises queue.Empty reliably means "done".
    for filename in filelist:
        work_queue.put(filename)
    # Start the workers, each sharing the word and the two queues.
    workers = []
    for _ in range(opts.count):
        worker = multiprocessing.Process(target=search,
                                         args=(word, work_queue, results_queue))
        worker.daemon = True
        workers.append(worker)
        worker.start()
    # Wait for all workers to complete.
    for worker in workers:
        worker.join()
    # Drain and present the results.
    results = []
    while True:
        try:
            results.append(results_queue.get(block=False))
        except queue.Empty:
            break
    print("\n".join(sorted(results)))
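
# Example invocation (script name and paths are hypothetical):
#   python find_word.py --count 4 --recurse magic ./notes ./src/main.py
# prints the sorted paths of every file under ./notes (recursively), plus
# ./src/main.py itself, that contains the word "magic".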