avalonv/unicode2ascii.py

## unicode2ascii.py
#!/usr/bin/python3
# this will TRY to replace a set of unicode characters with a corresponding set
# of US-ASCII ones. requires unidecode (https://github.com/avian2/unidecode),
# run 'pip3 install unidecode' to install.
# useful for translating mathematical symbols commonly found in PDFs into
# plain ascii ones, NOT recommended for transliterating text that doesn't use
# the latin alphabet, though it can still spot instances of that text for you.
# there's no guarantee the replacements it suggests will be accurate, so you
# should carefully inspect the suggestion for each individual line before
# writing.
# This code is licensed under the terms of the GNU General Public License v3
# gwyn oscuro 2022
from unidecode import unidecode as cleanse
from sys import argv, exit

# symbols to explicitly ignore, even when their code point is above the cutoff
# (append to the end of the string, don't use commas)
ignore = '−'

# any symbol whose unicode code point is higher than this value gets flagged
# for replacement. set the cutoff on a case by case basis: start with a high
# value and decrease it to filter more stuff, there's no universal solution.
# ex: 8230 will ignore things like em dashes and quotes, ideal for book quotes
cutoff = 8230

# Script body: scan the file named on the command line for characters above
# `cutoff` (minus the ones listed in `ignore`), print the ascii replacement
# unidecode suggests for every offending line, and rewrite the file in place
# only after the user explicitly confirms.

# the file to clean up is the first command line argument
if len(argv) > 1:
    file = argv[1]
else:
    print('Please supply a file')
    exit(1)

# newline='\n' means only '\n' terminates a line and nothing is translated,
# so any '\r' in the file is written back exactly as it was read
with open(file, newline='\n', mode='r', encoding='utf8') as f:
    rlines = f.readlines()
    # number of digits in the line count, used to right-align line numbers
    width = len(str(len(rlines)))

converts = []     # offending strings from lines that are safe to rewrite
impenitents = []  # 1-based numbers of lines that need a manual check
targets = set()   # 0-based indexes of the lines that will be rewritten
total = 0         # number of lines containing any offending character

# report templates, built once instead of on every loop iteration
fmt_unicd = "{:>{width}} | unicode: '{}'"
fmt_ascii = "{:>{width}} |   ascii: '{}'"
fmt_warn = "{:>{width}} | WARNING: char count mismatch, discarding"

for i, line in enumerate(rlines):
    # a character offends when it is above the cutoff and not explicitly
    # ignored; comparing directly is equivalent to (and much cheaper than)
    # searching a list holding every code point from 0 to cutoff
    sins = ''.join(c for c in line.strip()
                   if ord(c) > cutoff and c not in ignore)
    if not sins:
        continue
    total += 1
    repenters = cleanse(sins)
    if len(sins) == len(repenters):
        # one ascii character per offending character: safe to rewrite
        converts.append(sins)
        targets.add(i)
        print(fmt_unicd.format(i+1, sins, width=width))
        print(fmt_ascii.format(' ', repenters, width=width))
    else:
        # the suggestion changes the length of the text, so this line is
        # only reported and never rewritten
        impenitents.append(i+1)
        print(fmt_warn.format(i+1, width=width))
        print(fmt_unicd.format(' ', sins, width=width))
        print(fmt_ascii.format(' ', repenters, width=width))

print("########################################")
print("-Total offending lines:", total)
if impenitents:
    print("-Please manually check lines",
          ', '.join(str(n) for n in impenitents))
if converts:
    # unique offending characters, sorted for a stable report
    converts = sorted(set(''.join(converts)))
    print("-Offending characters:")
    print(converts)
    print("-Replace these characters with ascii?")
    if input(">type YES to proceed, anything else to quit: ").lower() == 'yes':
        # a single translate() pass per line replaces the chain of
        # str.replace() calls; only the lines that passed the length check
        # are touched
        table = {ord(c): cleanse(c) for c in converts}
        for i in targets:
            rlines[i] = rlines[i].translate(table)
        with open(file, newline='\n', mode='w', encoding='utf8') as f:
            f.writelines(rlines)
    else:
        print("-Aborting")
        exit(0)
	#!/usr/bin/python3
	# this will TRY to replace a set of unicode characters with a corresponding set
	# of usa-ascii ones. requires unidecode (https://github.com/avian2/unidecode),
	# run 'pip3 install unidecode' to install.
	# useful for translating mathematical symbols commonly found in PDFs into
	# plain ascii ones, NOT recommended for transliterating text that doesn't use
	# the latin alphabet, though it can still spot instances of that text for you.
	# there's no guarantee the replacements it suggests will be accurate, so you
	# should carefully inspect the suggestion for each individual line before
	# writing.
	# This code is licensed under the terms of the GNU General Public License v3
	# gwyn oscuro 2022
	from unidecode import unidecode as cleanse
	from sys import argv, exit

	# symbols to explicitly ignore, even when their code point is above the cutoff
	# (append to the end of the string, don't use commas)
	ignore = '−'

	# any symbol whose unicode code point is higher than this value gets flagged
	# for replacement. set the cutoff on a case by case basis: start with a high
	# value and decrease it to filter more stuff, there's no universal solution.
	# ex: 8230 will ignore things like em dashes and quotes, ideal for book quotes
	cutoff = 8230

	# Script body: scan the file named on the command line for characters above
	# `cutoff` (minus the ones listed in `ignore`), print the ascii replacement
	# unidecode suggests for every offending line, and rewrite the file in place
	# only after the user explicitly confirms.

	# the file to clean up is the first command line argument
	if len(argv) > 1:
	    file = argv[1]
	else:
	    print('Please supply a file')
	    exit(1)

	# newline='\n' means only '\n' terminates a line and nothing is translated,
	# so any '\r' in the file is written back exactly as it was read
	with open(file, newline='\n', mode='r', encoding='utf8') as f:
	    rlines = f.readlines()
	    # number of digits in the line count, used to right-align line numbers
	    width = len(str(len(rlines)))

	converts = []     # offending strings from lines that are safe to rewrite
	impenitents = []  # 1-based numbers of lines that need a manual check
	targets = set()   # 0-based indexes of the lines that will be rewritten
	total = 0         # number of lines containing any offending character

	# report templates, built once instead of on every loop iteration
	fmt_unicd = "{:>{width}} | unicode: '{}'"
	fmt_ascii = "{:>{width}} |   ascii: '{}'"
	fmt_warn = "{:>{width}} | WARNING: char count mismatch, discarding"

	for i, line in enumerate(rlines):
	    # a character offends when it is above the cutoff and not explicitly
	    # ignored; comparing directly is equivalent to (and much cheaper than)
	    # searching a list holding every code point from 0 to cutoff
	    sins = ''.join(c for c in line.strip()
	                   if ord(c) > cutoff and c not in ignore)
	    if not sins:
	        continue
	    total += 1
	    repenters = cleanse(sins)
	    if len(sins) == len(repenters):
	        # one ascii character per offending character: safe to rewrite
	        converts.append(sins)
	        targets.add(i)
	        print(fmt_unicd.format(i+1, sins, width=width))
	        print(fmt_ascii.format(' ', repenters, width=width))
	    else:
	        # the suggestion changes the length of the text, so this line is
	        # only reported and never rewritten
	        impenitents.append(i+1)
	        print(fmt_warn.format(i+1, width=width))
	        print(fmt_unicd.format(' ', sins, width=width))
	        print(fmt_ascii.format(' ', repenters, width=width))

	print("########################################")
	print("-Total offending lines:", total)
	if impenitents:
	    print("-Please manually check lines",
	          ', '.join(str(n) for n in impenitents))
	if converts:
	    # unique offending characters, sorted for a stable report
	    converts = sorted(set(''.join(converts)))
	    print("-Offending characters:")
	    print(converts)
	    print("-Replace these characters with ascii?")
	    if input(">type YES to proceed, anything else to quit: ").lower() == 'yes':
	        # a single translate() pass per line replaces the chain of
	        # str.replace() calls; only the lines that passed the length check
	        # are touched
	        table = {ord(c): cleanse(c) for c in converts}
	        for i in targets:
	            rlines[i] = rlines[i].translate(table)
	        with open(file, newline='\n', mode='w', encoding='utf8') as f:
	            f.writelines(rlines)
	    else:
	        print("-Aborting")
	        exit(0)