Skip to content

Instantly share code, notes, and snippets.

@yymm
Last active October 26, 2017 02:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yymm/6021be23059cac04ef0a996feb79c249 to your computer and use it in GitHub Desktop.
Save yymm/6021be23059cac04ef0a996feb79c249 to your computer and use it in GitHub Desktop.
Encoding and newline checker (requires chardet)
import os
from chardet.universaldetector import UniversalDetector
def check_encode(file_path):
    """Guess the character encoding of a file using chardet.

    The file is opened in binary mode and handed to a ``UniversalDetector``
    one line at a time; feeding stops early once the detector reports that
    it is done.

    Returns:
        An ``(encoding, confidence)`` tuple read from the detector's result.
    """
    sniffer = UniversalDetector()
    with open(file_path, mode='rb') as stream:
        for raw_line in stream:
            sniffer.feed(raw_line)
            if sniffer.done:
                # The detector has made up its mind; no need to read further.
                break
    # close() finalises the detector so that .result is populated.
    sniffer.close()
    outcome = sniffer.result
    return outcome['encoding'], outcome['confidence']
def check_newline(file_path):
    """Classify the line terminators used in a file.

    The file is read in binary mode and each line is checked for a trailing
    CR+LF or a bare LF.  A final line without any terminator is ignored: it
    carries no information about the newline style.

    Returns:
        'CR+LF' when every terminated line ends with CR+LF,
        'LF' when every terminated line ends with a bare LF, or when the
        file has no terminated line at all (e.g. an empty file),
        'Mixed(CR+LF & LF)' when both styles occur.
    """
    crlf = 0
    lf = 0
    with open(file_path, mode='rb') as f:
        for binary in f:
            # Test CR+LF first: such a line also ends with LF.
            if binary.endswith(b'\r\n'):
                crlf += 1
            elif binary.endswith(b'\n'):
                lf += 1
    # Decide from the terminator counts only.  Comparing against the total
    # line count (with a "minus one" allowance for an unterminated last
    # line) misreports e.g. a single LF line or a two-line mixed file as
    # CR+LF.
    if crlf and lf:
        return 'Mixed(CR+LF & LF)'
    if crlf:
        return 'CR+LF'
    return 'LF'  # LF only, or no terminator at all: judge LF
def check_all_files(files, encode, newline, threshold=0.95):
    """Report files whose encoding or newline style is not the expected one.

    Directories are skipped, as are files for which ``check_encode`` yields
    no encoding.  For every remaining file a line is printed when at least
    one of the following holds:

    * the detected encoding is neither ``encode`` nor ``'ascii'``,
    * the detection confidence is below ``threshold``,
    * the newline style returned by ``check_newline`` differs from
      ``newline``.

    Args:
        files: iterable of paths to inspect.
        encode: expected encoding name, compared verbatim with the detected
            one.
        newline: expected newline label as returned by ``check_newline``.
        threshold: minimum acceptable detection confidence.

    Returns:
        1 if any file was reported, otherwise 0.
    """
    exit_code = 0
    for f in files:
        if os.path.isdir(f):
            continue  # directory
        enc, con = check_encode(f)
        if enc is None:
            continue  # no encoding detected: treated as a binary file
        code = check_newline(f)
        # Collect every finding; each fragment starts with a space so they
        # can simply be concatenated.
        message = ''
        if enc != encode and enc != 'ascii':
            message += ' [not ' + encode + '] ' + enc
        if con < threshold:
            message += ' [low confidence] ' + str(con)
        if code != newline:
            message += ' [not ' + newline + '] ' + code
        if message:
            print(f, message)
            exit_code = 1
    return exit_code
def convert_newline(file_path, lf=True):
    """Convert the newlines of a file in place.

    lf=True  => CR+LF to LF (default)
    lf=False => LF to CR+LF

    The file is read and written in binary mode, so no other byte is
    touched.  A lone CR that is not followed by LF is left as it is.
    """
    with open(file_path, 'rb') as f:
        data = f.read()
    # Collapse CR+LF pairs first.  This keeps lone CR bytes intact (rather
    # than deleting every CR) and ensures that expanding to CR+LF below
    # never turns an existing CR+LF into CR+CR+LF.
    data = data.replace(b'\r\n', b'\n')
    if not lf:
        data = data.replace(b'\n', b'\r\n')
    with open(file_path, 'wb') as f:
        f.write(data)
def convert_all_files(files, origin, target):
    """Re-encode files in place from one character encoding to another.

    Directories in ``files`` are skipped.  Every other path is read as
    bytes, decoded with ``origin``, encoded with ``target`` and written
    back to the same path.

    Args:
        files: iterable of paths to convert.
        origin: codec name the files are currently encoded in.
        target: codec name to write the files in.

    Raises:
        UnicodeDecodeError / UnicodeEncodeError: if a file cannot be
            decoded with ``origin`` or encoded with ``target``.
    """
    for f in files:
        if os.path.isdir(f):
            continue  # directory
        with open(f, 'rb') as fobj:
            data = fobj.read()
        # Transcode before reopening for writing: if decoding or encoding
        # fails, the file has not been truncated yet.
        converted = data.decode(origin).encode(target)
        with open(f, 'wb') as fobj:
            fobj.write(converted)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment