Skip to content

Instantly share code, notes, and snippets.

@youfou
Last active May 24, 2019 10:58
Show Gist options
  • Save youfou/8c36923a972727049991a0a9a88857f6 to your computer and use it in GitHub Desktop.
Save youfou/8c36923a972727049991a0a9a88857f6 to your computer and use it in GitHub Desktop.
A script for counting word frequencies in text files
#!/usr/bin/env python3
# coding: utf-8
import os
import logging
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
__title__ = 'word-freq'
__version__ = '0.1.0'
__author__ = 'Youfou'
def get_arg_parser():
    """Build the command-line parser for the word-frequency script.

    Returns:
        An ``argparse.ArgumentParser`` exposing ``input_path`` (zero or more
        positional values), ``--recur``, ``--min_chars``, ``--output`` and
        ``--version``.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description='Count word frequencies in text files.')

    # (flags, keyword settings) pairs, registered in order so that the
    # generated help lists the options in the same sequence.
    option_specs = (
        (('input_path',), dict(
            type=str, nargs='*',
            help='files or dirs to scan (required)')),
        (('-r', '--recur'), dict(
            action='store_true', default=False,
            help='recur files in sub folders (default: parent folder only)')),
        (('-m', '--min_chars'), dict(
            type=int, default=2,
            help='specify min characters as a word (default: 2)')),
        (('-o', '--output'), dict(
            type=str, default='.', metavar='dir',
            help='specify which dir to save results '
                 '(default: current working dir)')),
        (('-v', '--version'), dict(
            action='store_true',
            help='show version and exit')),
    )
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    return parser
def load_content(path, recur=False):
    """Read text from a file, or from the text files found in a directory.

    Args:
        path: Path of a file or a directory. A file is read whatever its
            extension is; in a directory only names ending in ``.txt`` or
            ``.text`` are read.
        recur: When *path* is a directory, also descend into its
            sub-directories. By default only the files directly inside
            *path* are read.

    Returns:
        The contents of every loaded file joined with newlines, or an empty
        string when nothing was loaded.
    """
    paths = []
    if os.path.isfile(path):
        paths.append(path)
    elif os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            # Sorting ``dirs`` in place fixes the traversal order and sorting
            # ``files`` fixes the per-directory order, so the concatenated
            # text no longer depends on the file system's listing order.
            dirs.sort()
            paths.extend(
                os.path.join(root, name)
                for name in sorted(files)
                if name.endswith(('.txt', '.text'))
            )
            if not recur:
                # The first os.walk() entry is the top directory itself.
                break
    else:
        # Previously a mistyped path was ignored without any hint.
        logging.warning('Skipped (not a file or dir): {}'.format(path))

    texts = []
    for _path in paths:
        logging.info('Loading: {}'.format(_path))
        # No explicit encoding is given, so open() uses the platform default;
        # bytes that cannot be decoded are replaced instead of raising.
        with open(_path, errors='replace') as fp:
            texts.append(fp.read())
    return '\n'.join(texts)
def count_word_freq(text, min_chars=2):
    """Count how often each word occurs in *text*, ignoring case.

    A candidate word is a run of at least *min_chars* units, where a unit is
    either one ``\\w`` character or an upper-case ASCII letter followed by a
    dot. Candidates without any letter matching ``[a-z]`` (case-insensitive)
    are dropped, so pure numbers are not counted. Because the plain ``\\w``
    alternative is tried first, the final dot of a dotted abbreviation is not
    part of the match (for example ``U.S.`` is counted as ``u.s``).

    Args:
        text: The text to scan.
        min_chars: Minimum number of units a run needs to count as a word.

    Returns:
        A ``collections.Counter`` mapping each lower-cased word to its count.
    """
    import re
    from collections import Counter

    token_re = re.compile(r'(?:\w|[A-Z]\.){' + str(min_chars) + r',}')
    has_letter = re.compile(r'[a-z]', re.I).search

    # The pattern has no capturing group, so findall() yields whole matches.
    return Counter(
        token.lower()
        for token in token_re.findall(text)
        if has_letter(token)
    )
def save(data, path):
    """Write word counts to a spreadsheet file at *path*.

    Args:
        data: Object providing ``most_common()``; each item it returns is
            appended to the sheet as one row, in the order returned.
        path: Destination passed to the workbook's ``save`` method.
    """
    from openpyxl import Workbook

    workbook = Workbook()
    sheet = workbook.active
    ranked_rows = data.most_common()
    for ranked_row in ranked_rows:
        sheet.append(ranked_row)
    workbook.save(path)
def _output_path_for(input_path, output_dir):
    """Return the ``.xlsx`` path inside *output_dir* for *input_path*."""
    # abspath() drops trailing separators and resolves '.' / '..', so an
    # input such as 'docs/' gives 'docs.xlsx' instead of a file named '.xlsx'.
    base_name = os.path.basename(os.path.abspath(input_path))
    stem = os.path.splitext(base_name)[0]
    return os.path.join(output_dir, '{}.xlsx'.format(stem))


def main():
    """Run the command-line workflow.

    For every input path given on the command line, load its text, count the
    word frequencies and save them as ``<name>.xlsx`` in the output
    directory. Without input paths, print the version when ``--version`` was
    given, otherwise print the help text.
    """
    arg_parser = get_arg_parser()
    args = arg_parser.parse_args()

    if args.input_path:
        if not os.path.isdir(args.output):
            # exist_ok guards against the directory appearing between the
            # check above and this call.
            os.makedirs(args.output, exist_ok=True)
            logging.info('Created: {}'.format(args.output))

        for input_path in args.input_path:
            content = load_content(input_path, args.recur)

            logging.info('Counting: {}'.format(input_path))
            freq = count_word_freq(content, args.min_chars)

            output_path = _output_path_for(input_path, args.output)
            logging.info('Saving as: {}'.format(output_path))
            save(freq, output_path)
    elif args.version:
        print('{} {} by {}'.format(__title__, __version__, __author__))
    else:
        arg_parser.print_help()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment