Skip to content

Instantly share code, notes, and snippets.

@tamsanh
Created March 25, 2018 00:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tamsanh/32bb2a3a279a217465664bff0360d109 to your computer and use it in GitHub Desktop.
Save tamsanh/32bb2a3a279a217465664bff0360d109 to your computer and use it in GitHub Desktop.
Count words in all files in a given directory, either recursively or non-recursively.
import re
import io
import os
import csv
from glob import glob
# Path of the CSV report used when the caller does not supply an output name.
DEFAULT_OUTPUT_NAME = 'word-counts.csv'
def get_files_in_dir(target_dir, recursive=False):
    """Return the paths of the files found under *target_dir*.

    Args:
        target_dir: Directory to scan.
        recursive: When true, walk the whole tree below ``target_dir``;
            otherwise only look at its immediate entries.

    Returns:
        A list of paths, each prefixed with ``target_dir`` (or with the
        sub-directory it was found in when scanning recursively).

    Raises:
        OSError: In non-recursive mode, if ``target_dir`` cannot be listed.
    """
    if recursive:
        return [
            os.path.join(root, name)
            for root, _dirs, files in os.walk(target_dir)
            for name in files
        ]

    # os.listdir() also yields sub-directories. Drop them so this mode returns
    # exactly what the recursive mode returns for the top level; otherwise the
    # directories show up downstream as unreadable "files" with a count of 0.
    entries = (os.path.join(target_dir, entry) for entry in os.listdir(target_dir))
    return [path for path in entries if not os.path.isdir(path)]
def count_word_in_file(word, file_path):
    """Count the occurrences of *word* in the UTF-8 text file at *file_path*.

    ``word`` is matched literally (not as a regular expression), so search
    terms containing characters such as ``.`` or ``+`` are counted as written
    instead of being misread as a pattern or raising ``re.error``.
    Occurrences are non-overlapping and are counted anywhere in the text,
    including inside longer words.

    Args:
        word: The text to look for.
        file_path: Path of the file to read.

    Returns:
        The number of occurrences, or 0 if the file cannot be opened or is
        not valid UTF-8 (a message is printed in that case).
    """
    try:
        with io.open(file_path, 'r', encoding='UTF8') as f:
            text = f.read()
    except (IOError, UnicodeDecodeError):
        # Best effort: unreadable or non-text files are reported and skipped.
        print("Failed to parse File(%s)" % file_path)
        return 0
    return text.count(word)
def count_word_in_all_files(word, file_path_list):
    """Build a mapping of file path -> occurrence count of *word*.

    Each path in *file_path_list* is handed to ``count_word_in_file`` in
    order; a path that appears more than once keeps the count from its last
    occurrence.
    """
    return {path: count_word_in_file(word, path) for path in file_path_list}
def output_word_counts_csv(word, file_word_counts, output_file_name):
    """Write the per-file counts of *word* to a CSV report.

    The report has a header row followed by one row per entry of
    *file_word_counts*, with the columns ``filename``, ``wordcount`` and
    ``word``. The file is written as UTF-8 and overwritten if it exists.

    Args:
        word: The word that was counted; repeated on every row.
        file_word_counts: Mapping of file path -> number of occurrences.
        output_file_name: Path of the CSV file to create.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    # Python 3's csv module writes str, so the file must be opened in text
    # mode; newline='' lets the writer control line endings itself. Values are
    # passed through unencoded -- pre-encoding them would put b'...' reprs
    # into the report.
    with open(output_file_name, 'w', newline='', encoding='utf-8') as csvfile:
        word_count_writer = csv.DictWriter(
            csvfile,
            fieldnames=['filename', 'wordcount', 'word'],
            delimiter=',',
        )
        word_count_writer.writeheader()
        word_count_writer.writerows(
            {'filename': filename, 'wordcount': word_count, 'word': word}
            for filename, word_count in file_word_counts.items()
        )
    print("Output to %s" % output_file_name)
def main(word, target_dir, recursive=False, output_file_name=DEFAULT_OUTPUT_NAME):
    """Count *word* in the files under *target_dir* and write a CSV report.

    The files are collected first (recursively if *recursive* is true), each
    one is then searched for *word*, and the resulting counts are written to
    *output_file_name*.
    """
    paths = get_files_in_dir(target_dir, recursive=recursive)
    counts_by_path = count_word_in_all_files(word, paths)
    output_word_counts_csv(word, counts_by_path, output_file_name)
if __name__ == "__main__":
    # Reference: https://gist.github.com/tamsanh/17f6e135bdcdcccb5f877531ca909ad6
    import argparse
    import sys

    class HelpDefaultParser(argparse.ArgumentParser):
        """ArgumentParser that prints the full help text on a usage error."""

        def error(self, message):
            # Report the problem, show the complete help, then exit with
            # status 2.
            sys.stderr.write('error: %s\n' % message)
            self.print_help()
            sys.exit(2)

    parser = HelpDefaultParser(
        description="Counts the words in all files in a given glob",
    )
    parser.add_argument(
        'WORD',
        help="The target word to look for.",
    )
    parser.add_argument(
        'DIRECTORY',
        help="The target directory path to find all the files.",
    )
    parser.add_argument(
        '-r', '--recursive',
        action="store_true",
        default=False,
        help="Recursively scan directories for files",
    )
    parser.add_argument(
        '-o', '--output',
        default=DEFAULT_OUTPUT_NAME,
        help="The name of the file to be outputted",
    )

    cli_args = parser.parse_args()
    main(
        cli_args.WORD,
        cli_args.DIRECTORY,
        recursive=cli_args.recursive,
        output_file_name=cli_args.output,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment