Skip to content

Instantly share code, notes, and snippets.

@avalonv
Last active September 30, 2022 19:38
Show Gist options
  • Save avalonv/33874f2e4b841394488d556eb65fc604 to your computer and use it in GitHub Desktop.
Save avalonv/33874f2e4b841394488d556eb65fc604 to your computer and use it in GitHub Desktop.
Replace unicode symbols with ascii ones
#!/usr/bin/python3
# this will TRY to replace a set of unicode characters with a corresponding set
# of us-ascii ones. requires unidecode (https://github.com/avian2/unidecode),
# run 'pip3 install unidecode' to install.
# useful for translating mathematical symbols commonly found in PDFs into
# plain ascii ones, NOT recommended for transliterating text that doesn't use
# the latin alphabet, though it can still spot instances of that text for you.
# there's no guarantee the replacements it suggests will be accurate, so you
# should carefully inspect the suggestion for each individual line before
# writing.
# This code is licensed under the terms of the GNU General Public License v3
# gwyn oscuro 2022
from unidecode import unidecode as cleanse
from sys import argv, exit
# symbols to explicitly ignore (append to end of string, don't use commas)
ignore = '−'
# any symbols with unicode codes higher than this value will be replaced.
# set the cutoff on a case by case basis: start with a high value and
# decrease it to filter more stuff, there's no universal solution
# ex: 8230 (U+2026, the horizontal ellipsis) will ignore things like em
# dashes and quotes, ideal for book quotes
cutoff = 8230


def offending_chars(text, limit, skip):
    """Return the characters of text that should be replaced.

    A character is offending when its code point is greater than limit and
    it does not appear in skip (a string of explicitly ignored symbols).
    The characters are returned as one string, in their original order.
    """
    return ''.join(c for c in text if ord(c) > limit and c not in skip)


def main():
    """Scan the file named in argv[1] and offer to rewrite it as ascii.

    For every line holding offending characters, prints those characters
    next to their suggested replacement. Lines whose replacement has a
    different length than the original characters are only reported, never
    rewritten. The file is modified in place only after the user types YES.
    """
    if len(argv) > 1:
        file = argv[1]
    else:
        print('Please supply a file')
        exit(1)
    # newline='\n' disables newline translation, so '\r\n' endings survive
    # the read/write round trip untouched
    try:
        with open(file, newline='\n', mode='r', encoding='utf8') as f:
            rlines = f.readlines()
    except (OSError, UnicodeDecodeError) as err:
        print('Could not read {}: {}'.format(file, err))
        exit(1)
    # line numbers are right-aligned to the width of the largest one
    width = len(str(len(rlines)))
    fmt_unicd = "{:>{width}} | unicode: '{}'"
    fmt_ascii = "{:>{width}} | ascii: '{}'"
    converts = []      # offending characters of every line that can be fixed
    impenitents = []   # 1-based numbers of lines needing manual attention
    targets = set()    # 0-based indexes of lines that will be rewritten
    total = 0
    for i, line in enumerate(rlines):
        sins = offending_chars(line.strip(), cutoff, ignore)
        if not sins:
            continue
        total += 1
        repenters = cleanse(sins)
        if len(sins) == len(repenters):
            converts.append(sins)
            targets.add(i)
            print(fmt_unicd.format(i+1, sins, width=width))
            print(fmt_ascii.format(' ', repenters, width=width))
        else:
            # the suggestion is longer or shorter than what it replaces,
            # so it is shown for reference but the line is left alone
            impenitents.append(i+1)
            print("{:>{width}} | WARNING: char count mismatch, discarding".
                  format(i+1, width=width))
            print(fmt_unicd.format(' ', sins, width=width))
            print(fmt_ascii.format(' ', repenters, width=width))
    print("########################################")
    print("-Total offending lines:", total)
    if impenitents:
        print("-Please manually check lines",
              ', '.join(str(n) for n in impenitents))
    if converts:
        converts = sorted(set(''.join(converts)))
        print("-Offending characters:")
        print(converts)
        print("-Replace these characters with ascii?")
        answer = input(">type YES to proceed, anything else to quit: ")
        if answer.lower() == 'yes':
            # one translation table, built once, replaces every offending
            # character of a target line in a single pass
            table = {ord(c): cleanse(c) for c in converts}
            wlines = [line.translate(table) if i in targets else line
                      for i, line in enumerate(rlines)]
            with open(file, newline='\n', mode='w', encoding='utf8') as f:
                f.writelines(wlines)
        else:
            print("-Aborting")
            exit(0)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment