Created
March 25, 2018 00:21
-
-
Save tamsanh/32bb2a3a279a217465664bff0360d109 to your computer and use it in GitHub Desktop.
Count words in all files in a given directory, either recursively or non-recursively.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import io
import os
import re
from glob import glob
# File name the CSV report is written to when the caller does not pick one.
DEFAULT_OUTPUT_NAME = 'word-counts.csv'
def get_files_in_dir(target_dir, recursive=False):
    """Return the paths of the files found in ``target_dir``.

    Args:
        target_dir: Directory to scan.
        recursive: When true, collect the files of every directory that
            ``os.walk`` visits below ``target_dir``; otherwise only the
            direct entries of ``target_dir`` are considered.

    Returns:
        A sorted list of paths, each one joined onto the directory the
        file was found in.

    Raises:
        OSError: In non-recursive mode, if ``target_dir`` cannot be listed.
    """
    if recursive:
        file_paths = [os.path.join(root, name)
                      for root, _dirs, files in os.walk(target_dir)
                      for name in files]
    else:
        # os.listdir() returns sub-directories as well; keep regular files
        # only so directories are not reported as unreadable "files" later.
        candidates = (os.path.join(target_dir, name)
                      for name in os.listdir(target_dir))
        file_paths = [path for path in candidates if os.path.isfile(path)]
    # Sort so the caller sees a stable order regardless of the file system.
    return sorted(file_paths)
def count_word_in_file(word, file_path):
    """Count the non-overlapping occurrences of ``word`` in a UTF-8 file.

    ``word`` is matched literally, so characters such as ``.`` or ``+``
    have no special meaning and can never make the search fail.

    Args:
        word: The text to look for.
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        The number of occurrences, or 0 when the file cannot be opened or
        decoded (a message naming the file is printed in that case).
    """
    try:
        with io.open(file_path, 'r', encoding='UTF8') as f:
            contents = f.read()
    except (IOError, UnicodeDecodeError):
        # Best effort: an unreadable or non-UTF-8 file counts as zero
        # instead of aborting the whole scan.
        print("Failed to parse File(%s)" % file_path)
        return 0
    return contents.count(word)
def count_word_in_all_files(word, file_path_list):
    """Build a mapping from each given path to its count of ``word``.

    Every path in ``file_path_list`` is handed to ``count_word_in_file``;
    the result is a dict keyed by path whose values are the counts that
    function returned.
    """
    return {path: count_word_in_file(word, path) for path in file_path_list}
def output_word_counts_csv(word, file_word_counts, output_file_name):
    """Write the per-file counts of ``word`` to a CSV file.

    The file gets a ``filename,wordcount,word`` header followed by one row
    per entry of ``file_word_counts``. An existing file is overwritten.

    Args:
        word: The word that was counted; repeated in every row.
        file_word_counts: Mapping of file path to occurrence count.
        output_file_name: Path of the CSV file to create.
    """
    # The csv module writes text on Python 3: open in text mode, let it
    # control line endings via newline='', and encode once at the file
    # boundary instead of encoding every field to bytes.
    with open(output_file_name, 'w', newline='', encoding='utf-8') as csvfile:
        word_count_writer = csv.DictWriter(
            csvfile,
            fieldnames=['filename', 'wordcount', 'word'],
            delimiter=',',
        )
        word_count_writer.writeheader()
        for filename, word_count in file_word_counts.items():
            word_count_writer.writerow({
                'filename': filename,
                'wordcount': word_count,
                'word': word,
            })
    print("Output to %s" % output_file_name)
def main(word, target_dir, recursive=False, output_file_name=DEFAULT_OUTPUT_NAME):
    """Count ``word`` in the files of ``target_dir`` and write a CSV report.

    The files are gathered with ``get_files_in_dir`` (honouring
    ``recursive``), counted with ``count_word_in_all_files`` and the
    result is written to ``output_file_name`` by ``output_word_counts_csv``.
    """
    candidate_paths = get_files_in_dir(target_dir, recursive=recursive)
    counts_by_path = count_word_in_all_files(word, candidate_paths)
    output_word_counts_csv(word, counts_by_path, output_file_name)
if __name__ == "__main__":
    # Parser subclass taken from
    # https://gist.github.com/tamsanh/17f6e135bdcdcccb5f877531ca909ad6
    import argparse
    import sys

    class HelpDefaultParser(argparse.ArgumentParser):
        """Argument parser that prints the full help text on a usage error."""

        def error(self, message):
            # Report the problem on stderr, print the help, exit with status 2.
            sys.stderr.write('error: %s\n' % message)
            self.print_help()
            sys.exit(2)

    cli = HelpDefaultParser(
        description="Counts the words in all files in a given glob",
    )
    cli.add_argument(
        'WORD',
        help="The target word to look for.",
    )
    cli.add_argument(
        'DIRECTORY',
        help="The target directory path to find all the files.",
    )
    cli.add_argument(
        '-r', '--recursive',
        action="store_true",
        default=False,
        help="Recursively scan directories for files",
    )
    cli.add_argument(
        '-o', '--output',
        default=DEFAULT_OUTPUT_NAME,
        help="The name of the file to be outputted",
    )

    options = cli.parse_args()
    main(
        options.WORD,
        options.DIRECTORY,
        recursive=options.recursive,
        output_file_name=options.output,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment