Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save po5i/b7c1f7230277bdb4aa5ce68c66b37eed to your computer and use it in GitHub Desktop.
Save po5i/b7c1f7230277bdb4aa5ce68c66b37eed to your computer and use it in GitHub Desktop.
Find strings with non-latin characters in files
import sys
import os
import codecs
import unicodedata as ud
# from http://stackoverflow.com/questions/3094498/how-can-i-check-if-a-python-unicode-string-contains-non-western-letters
# Memo of per-character results: maps a character to True when its Unicode
# name contains 'LATIN', False otherwise.
latin_letters = {}


def is_latin(uchr):
    """Return True if the Unicode name of *uchr* contains 'LATIN'.

    Results are memoized in the module-level ``latin_letters`` dict, so each
    distinct character is looked up in ``unicodedata`` only once.

    Characters that have no Unicode name (for example control characters
    such as a newline) are reported as not Latin instead of raising
    ``ValueError``.
    """
    try:
        return latin_letters[uchr]
    except KeyError:
        # The '' default keeps ud.name() from raising on unnamed characters.
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr, ''))
def only_roman_chars(unistr):
    """Return True when is_latin() holds for every alphabetic char of *unistr*.

    Non-alphabetic characters (digits, punctuation, whitespace, ...) are
    skipped, so a string containing no letters at all yields True.
    """
    for uchr in unistr:
        # Only letters are checked (isalpha filter suggested by John Machin).
        if uchr.isalpha() and not is_latin(uchr):
            return False
    return True
# Walk the directory tree given as the first command-line argument and print
# every line (preceded by its file path) that contains a non-Latin letter.
if len(sys.argv) < 2:
    # Exit with a readable message instead of an IndexError traceback.
    sys.exit('usage: %s <directory>' % sys.argv[0])

path = sys.argv[1]
for dirpath, dirnames, filenames in os.walk(path):
    for fn in filenames:
        full_path = os.path.join(dirpath, fn)
        try:
            with codecs.open(full_path, encoding='utf-8') as f:
                for line in f:
                    if not only_roman_chars(line):
                        # Single-argument print() calls produce the same
                        # output on Python 2 and Python 3.
                        print('%s :' % full_path)
                        print(line)
        except UnicodeDecodeError:
            # Not valid UTF-8 (e.g. a binary file): skip the rest of the file.
            continue
        except Exception:
            # Report which file failed, then let the error propagate.
            print('Error while reading %s' % full_path)
            raise
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment