Skip to content

Instantly share code, notes, and snippets.

@burnash
Last active May 23, 2023 22:47
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save burnash/d6d35fbabd2566f2b1b9 to your computer and use it in GitHub Desktop.
Save burnash/d6d35fbabd2566f2b1b9 to your computer and use it in GitHub Desktop.
Find strings with non-latin characters in files
import sys
import os
import codecs
import unicodedata as ud
# from http://stackoverflow.com/questions/3094498/how-can-i-check-if-a-python-unicode-string-contains-non-western-letters
# Memo of already-classified characters: maps a single character to True
# when its Unicode name contains 'LATIN', False otherwise.
latin_letters = {}


def is_latin(uchr):
    """Return True if the Unicode name of *uchr* contains 'LATIN'.

    Results are memoised in the module-level ``latin_letters`` dict so each
    distinct character is looked up in the Unicode database only once.

    Characters that have no Unicode name (for example control characters
    such as a newline) are reported as non-Latin instead of raising
    ``ValueError``.
    """
    try:
        return latin_letters[uchr]
    except KeyError:
        # ud.name() raises ValueError for unnamed characters unless a
        # default is supplied; an empty name can never contain 'LATIN'.
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr, ''))
def only_roman_chars(unistr):
    """Return True when every alphabetic character of *unistr* is Latin.

    Non-alphabetic characters (digits, punctuation, whitespace, ...) are
    ignored, so a string without any letters yields True. Scanning stops
    at the first alphabetic character that ``is_latin`` rejects.
    """
    for char in unistr:
        # Only letters are classified (isalpha filter suggested by John Machin).
        if char.isalpha() and not is_latin(char):
            return False
    return True
# Walk the directory tree given as the first command-line argument and print
# every line (preceded by its file path) that contains a non-Latin letter.
path = sys.argv[1]
for dirpath, dirnames, filenames in os.walk(path):
    for fn in filenames:
        full_path = os.path.join(dirpath, fn)
        try:
            with codecs.open(full_path, encoding='utf-8') as f:
                for line in f:
                    if not only_roman_chars(line):
                        # Single-argument print(...) emits the same text under
                        # both the Python 2 print statement and the Python 3
                        # print function.
                        print(full_path + ' :')
                        # `line` is printed as read, including any line ending
                        # it carries.
                        print(line)
        except UnicodeDecodeError:
            # The file is not valid UTF-8 (likely binary): skip the rest of
            # it. Matches found before the undecodable data may already have
            # been printed.
            continue
        except Exception:
            # Report which file failed, then propagate the error unchanged.
            # Exception (rather than a bare except) keeps KeyboardInterrupt
            # and SystemExit from being reported as read errors.
            print('Error while reading ' + full_path)
            raise
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment