Created
April 8, 2018 22:25
-
-
Save leonpanokarren/0c310c0760c186a4c5df2e3af0a7a422 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import argparse | |
import multiprocessing | |
import queue | |
# Number of bytes read from a file per chunk while scanning.
BLOCK_SIZE = 8000
def parse_options():
    """Build the command-line parser and parse sys.argv.

    Returns a tuple ``(args, args.word, args.files)`` where ``word`` is
    the search term and ``files`` is the list of file/directory arguments.
    """
    parser = argparse.ArgumentParser(description="find(grep for) a word in argument files")
    parser.add_argument("--count", dest="count", type=int, default=3, help="number of workers to use")
    parser.add_argument("--debug", dest="debug", action='store_true', help="include worker number in output")
    parser.add_argument("--recurse", dest="recurse", action='store_true', help="flag to indicate if 'file' arguments should be iteratively searched for other files")
    parser.add_argument("word", type=str, help="the word to search for")
    parser.add_argument("files", type=str, metavar="file_dir", nargs="+", help="a file/directory to search in")
    parsed = parser.parse_args()
    return parsed, parsed.word, parsed.files
def get_files(args, recurse):
    """Expand file/directory arguments into a flat list of file paths.

    args: iterable of file or directory paths.
    recurse: when True, directories are walked recursively; when False,
        only files directly inside a directory are included.

    Returns the list of file paths. Paths that are neither an existing
    file nor a directory are silently skipped.
    """
    files = []
    for item in args:
        if os.path.isfile(item):
            files.append(item)
        elif os.path.isdir(item) and recurse:
            for root, _dirs, sub_files in os.walk(item):
                files.extend(os.path.join(root, name) for name in sub_files)
        elif os.path.isdir(item):
            for entry in os.listdir(item):
                # BUG FIX: os.listdir yields bare names, so the original
                # os.path.isfile(entry) tested relative to the CWD and
                # dropped nearly every file; test the joined path instead.
                path = os.path.join(item, entry)
                if os.path.isfile(path):
                    files.append(path)
    return files
def search(word, work_queue, results_queue):
    """Worker loop: drain filenames from work_queue, scanning each for word.

    word: the string to look for.
    work_queue: queue of filenames; drained non-blocking until queue.Empty.
    results_queue: each filename containing word is put here (at most once).

    Files are read in BLOCK_SIZE-byte chunks and decoded as UTF-8 with
    undecodable bytes dropped. The worker returns when the queue is empty.

    NOTE(review): a match whose multibyte UTF-8 bytes are split exactly at
    a chunk boundary can still be missed, because each chunk is decoded
    independently with errors="ignore" — confirm whether that matters here.
    """
    while True:
        try:
            filename = work_queue.get(block=False)
        except queue.Empty:
            # No more work; this worker is done.
            break
        previous = ""
        with open(filename, "rb") as f:
            while True:
                raw = f.read(BLOCK_SIZE)
                if not raw:
                    break
                current = raw.decode("utf-8", "ignore")
                # Match inside this chunk, or across the seam between the
                # previous chunk's tail and this chunk's head.
                if (word in current or
                        word in previous[-len(word):] + current[:len(word)]):
                    results_queue.put(filename)
                    break
                # BUG FIX: the original compared the *decoded character*
                # count against BLOCK_SIZE (a byte count), so any multibyte
                # input ended the scan after one chunk; compare the raw
                # byte count instead.
                if len(raw) != BLOCK_SIZE:
                    break
                previous = current
if __name__ == '__main__':
    opts, word, args = parse_options()
    filelist = get_files(args, opts.recurse)

    # Queues shared with the worker processes: one feeds filenames in,
    # the other collects the names of matching files.
    work_queue = multiprocessing.Queue()
    results_queue = multiprocessing.Queue()
    for path in filelist:
        work_queue.put(path)

    # NOTE(review): a worker that starts before the queue's feeder thread
    # has flushed any items can see Empty immediately and exit early —
    # confirm whether that race matters for this workload.
    workers = []
    for _ in range(opts.count):
        proc = multiprocessing.Process(target=search,
                                       args=[word, work_queue, results_queue])
        workers.append(proc)
        # Daemonize so stray workers die with the parent process.
        proc.daemon = True
        proc.start()

    # Wait for every worker to finish its share of the queue.
    for proc in workers:
        proc.join()

    # Drain the results queue, then print the matches sorted, one per line.
    results = []
    while True:
        try:
            results.append(results_queue.get(block=False))
        except queue.Empty:
            break
    print("\n".join(sorted(results)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment