Created
April 8, 2018 22:25
-
-
Save leonpanokarren/0c310c0760c186a4c5df2e3af0a7a422 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import argparse | |
import multiprocessing | |
import queue | |
# Number of bytes read from a file per chunk while scanning.
BLOCK_SIZE = 8000
def parse_options():
    """Build the command-line parser and parse sys.argv.

    Returns a tuple ``(args, args.word, args.files)`` where ``word`` is
    the search term and ``files`` is the list of file/directory arguments.
    """
    parser = argparse.ArgumentParser(description="find(grep for) a word in argument files")
    parser.add_argument("--count", dest="count", type=int, default=3, help="number of workers to use")
    parser.add_argument("--debug", dest="debug", action='store_true', help="include worker number in output")
    parser.add_argument("--recurse", dest="recurse", action='store_true', help="flag to indicate if 'file' arguments should be iteratively searched for other files")
    parser.add_argument("word", type=str, help="the word to search for")
    parser.add_argument("files", type=str, metavar="file_dir", nargs="+", help="a file/directory to search in")
    parsed = parser.parse_args()
    return parsed, parsed.word, parsed.files
def get_files(args, recurse):
    """Expand file/directory arguments into a flat list of file paths.

    args: iterable of file or directory paths.
    recurse: when True, directories are walked recursively; when False,
        only files directly inside a directory are included.

    Returns the list of file paths. Paths that are neither an existing
    file nor a directory are silently skipped.
    """
    files = []
    for item in args:
        if os.path.isfile(item):
            files.append(item)
        elif os.path.isdir(item) and recurse:
            for root, _dirs, sub_files in os.walk(item):
                files.extend(os.path.join(root, name) for name in sub_files)
        elif os.path.isdir(item):
            for entry in os.listdir(item):
                # BUG FIX: os.listdir yields bare names, so the original
                # os.path.isfile(entry) tested relative to the CWD and
                # dropped nearly every file; test the joined path instead.
                path = os.path.join(item, entry)
                if os.path.isfile(path):
                    files.append(path)
    return files
def search(word, work_queue, results_queue):
    """Worker loop: drain filenames from work_queue, scanning each for word.

    word: the string to look for.
    work_queue: queue of filenames; drained non-blocking until queue.Empty.
    results_queue: each filename containing word is put here (at most once).

    Files are read in BLOCK_SIZE-byte chunks and decoded as UTF-8 with
    undecodable bytes dropped. The worker returns when the queue is empty.

    NOTE(review): a match whose multibyte UTF-8 bytes are split exactly at
    a chunk boundary can still be missed, because each chunk is decoded
    independently with errors="ignore" — confirm whether that matters here.
    """
    while True:
        try:
            filename = work_queue.get(block=False)
        except queue.Empty:
            # No more work; this worker is done.
            break
        previous = ""
        with open(filename, "rb") as f:
            while True:
                raw = f.read(BLOCK_SIZE)
                if not raw:
                    break
                current = raw.decode("utf-8", "ignore")
                # Match inside this chunk, or across the seam between the
                # previous chunk's tail and this chunk's head.
                if (word in current or
                        word in previous[-len(word):] + current[:len(word)]):
                    results_queue.put(filename)
                    break
                # BUG FIX: the original compared the *decoded character*
                # count against BLOCK_SIZE (a byte count), so any multibyte
                # input ended the scan after one chunk; compare the raw
                # byte count instead.
                if len(raw) != BLOCK_SIZE:
                    break
                previous = current
if __name__ == '__main__':
    opts, word, args = parse_options()
    filelist = get_files(args, opts.recurse)

    # Queues shared with the worker processes: one feeds filenames in,
    # the other collects the names of matching files.
    work_queue = multiprocessing.Queue()
    results_queue = multiprocessing.Queue()
    for path in filelist:
        work_queue.put(path)

    # NOTE(review): a worker that starts before the queue's feeder thread
    # has flushed any items can see Empty immediately and exit early —
    # confirm whether that race matters for this workload.
    workers = []
    for _ in range(opts.count):
        proc = multiprocessing.Process(target=search,
                                       args=[word, work_queue, results_queue])
        workers.append(proc)
        # Daemonize so stray workers die with the parent process.
        proc.daemon = True
        proc.start()

    # Wait for every worker to finish its share of the queue.
    for proc in workers:
        proc.join()

    # Drain the results queue, then print the matches sorted, one per line.
    results = []
    while True:
        try:
            results.append(results_queue.get(block=False))
        except queue.Empty:
            break
    print("\n".join(sorted(results)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment