Skip to content

Instantly share code, notes, and snippets.

@bertsky
Created April 2, 2024 11:39
Show Gist options
  • Save bertsky/8c50d98a8ee8babdfc03ecc1da686c5a to your computer and use it in GitHub Desktop.
Save bertsky/8c50d98a8ee8babdfc03ecc1da686c5a to your computer and use it in GitHub Desktop.
Aggregate character histogram for the given text files
#!/usr/bin/env python3
import argparse
import os
import sys
import io
from functools import reduce
import json
import unicodedata
# Command-line interface: zero or more input paths plus switches for output
# format, output order, verbosity and Unicode normalization.
arg_parser = argparse.ArgumentParser(
    description='Aggregate character histogram for the given text files.',
)
arg_parser.add_argument(
    "path",
    nargs='*',
    help="file or directory of text file(s)",
)
arg_parser.add_argument(
    "-f", "--format",
    choices=["json", "csv"],
    default="csv",
    help="output format",
)
arg_parser.add_argument(
    "-o", "--order",
    choices=["char", "freq"],
    default="freq",
    help="output order",
)
arg_parser.add_argument(
    "-v", "--verbose",
    action="store_true",
    help="show Unicode codepoint names",
)
arg_parser.add_argument(
    "-n", "--normalization",
    choices=["", "NFC", "NFKC", "NFD", "NFKD"],
    default="",  # empty string means: leave the text as it is
    help="Unicode normalization form",
)
# Parsed once at module level; the functions below read this namespace.
args = arg_parser.parse_args()
def update(histogram, char):
    """Count one occurrence of ``char`` in ``histogram``.

    The signature (accumulator first, next item second, accumulator returned)
    makes this usable as the step function of ``functools.reduce``.

    Args:
        histogram: dict mapping characters to occurrence counts; it is
            modified in place.
        char: the key whose count is incremented.

    Returns:
        The same ``histogram`` object that was passed in.
    """
    # dict.get yields 0 for a character not seen before, so no separate
    # membership test is needed.
    histogram[char] = histogram.get(char, 0) + 1
    return histogram
def normalize(text):
    """Apply the Unicode normalization form selected on the command line.

    When ``args.normalization`` is empty, ``text`` is returned unchanged;
    otherwise it is passed through ``unicodedata.normalize`` with that form.
    """
    form = args.normalization
    return unicodedata.normalize(form, text) if form else text
# Read every input, normalize its text if requested, and count its characters.
# Inputs are only read; nothing is written back to them.
def _read_text(path):
    """Return the whole text of ``path``; ``'-'`` means standard input.

    Regular files are decoded as UTF-8. Standard input is read through the
    existing ``sys.stdin`` text stream and is deliberately not closed, so
    passing ``-`` more than once does not fail on a closed file.
    """
    if path == '-':
        return sys.stdin.read()
    with io.open(path, "r", encoding="utf-8") as file:
        return file.read()


histogram = {}
for path in args.path:
    if os.path.isdir(path):
        # Directories are scanned one level deep. Entries that are not
        # regular files (e.g. sub-directories) are skipped, because opening
        # them for reading would raise.
        candidates = (os.path.join(path, entry) for entry in os.listdir(path))
        filepaths = [filepath for filepath in candidates if os.path.isfile(filepath)]
    else:
        filepaths = [path]
    for filepath in filepaths:
        text = normalize(_read_text(filepath))
        histogram = reduce(update, text, histogram)
def sort(charfreq):
    """Sort key for a ``(char, freq)`` histogram item.

    Picks the character when ``args.order`` is ``'char'`` and the frequency
    when it is ``'freq'``; for any other value the key is ``None``.
    """
    position = {'char': 0, 'freq': 1}.get(args.order)
    if position is None:
        return None
    return charfreq[position]
# Order the histogram as requested, then emit it in exactly one format.
histogram = dict(sorted(histogram.items(), key=sort))
total = sum(histogram.values())
if args.format == 'json':
    print(json.dumps(histogram, indent=2))
else:
    # The table lives in an explicit else branch: a bare `exit` expression
    # only names the builtin without calling it, so it would not stop the
    # script and the table would be printed after the JSON as well.
    # Columns are tab-separated: character, absolute count, percentage of
    # all counted characters, and (with --verbose) the Unicode name.
    print("char\tfreq#\tfreq%\tchar name\n")
    for char, freq in histogram.items():
        if args.verbose:
            try:
                name = unicodedata.name(char)
            except ValueError:
                # unicodedata.name raises ValueError for characters that
                # have no name assigned.
                name = "unmapped Unicode char " + repr(char)
        else:
            name = ""
        # total cannot be zero here: the loop body only runs when the
        # histogram has at least one entry.
        print(f"{char}\t{freq}\t{freq/total*100:2.4f}\t{name}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment