Skip to content

Instantly share code, notes, and snippets.

@messyidea
Created June 27, 2018 09:11
Show Gist options
  • Save messyidea/9d2dd3ac644f9ed9347143389ecd1628 to your computer and use it in GitHub Desktop.
Save messyidea/9d2dd3ac644f9ed9347143389ecd1628 to your computer and use it in GitHub Desktop.
ConvertTool.py
# -*- coding: utf-8 -*-
import os
import sys
import codecs
import threading
import json
import time
import hashlib
import shutil
import argparse
from chardet.universaldetector import UniversalDetector
class ConvertHandler(object):
    """Detect the encoding of files with chardet and re-encode them in place.

    Args:
        confidence: Minimum detector confidence (exclusive) a file must
            reach before it is converted.
        verbose: Print detection/conversion messages. ``None`` (default)
            defers to ``args.verbose`` of a module-level ``args`` namespace
            when one exists (script usage), otherwise ``False``.
        handle_all: Descend into sub-directories in ``convert_dir``.
            ``None`` (default) defers to ``args.handle_all`` in the same way.
    """

    def __init__(self, confidence=0.95, verbose=None, handle_all=None):
        self.confidence = confidence
        # Upper bound on the number of lines fed to the detector per file.
        self.max_detect_lines = 600
        self.verbose = verbose
        self.handle_all = handle_all

    def _option(self, name):
        """Return the boolean option *name* ('verbose' or 'handle_all').

        An explicit constructor value wins. Otherwise the module-level
        ``args`` namespace created by the command-line entry point is
        consulted; if it does not exist (the class was imported rather than
        run as a script) the option is off instead of raising NameError.
        """
        explicit = getattr(self, name, None)
        if explicit is not None:
            return bool(explicit)
        return bool(getattr(globals().get('args'), name, False))

    @staticmethod
    def _same_codec(first, second):
        """Return True when both names refer to the same Python codec.

        Names are compared through ``codecs.lookup`` so that spelling and
        case variants ('UTF-8' / 'utf-8', 'latin-1' / 'ISO-8859-1') match.
        Unknown or non-string names compare unequal; the caller reports the
        lookup problem when it actually tries to use the codec.
        """
        if first == second:
            return True
        try:
            return codecs.lookup(first).name == codecs.lookup(second).name
        except (LookupError, TypeError):
            return False

    def _detect(self, file_name, cnt):
        """Guess the encoding of *file_name* from at most *cnt* lines.

        Returns:
            ``(encoding, confident)``. ``encoding`` is the upper-cased name
            reported by the detector (``""`` for a missing or empty file,
            ``None`` when nothing was detected); ``confident`` is True only
            when an encoding was found and its confidence exceeds
            ``self.confidence``.
        """
        if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
            return "", False
        detector = UniversalDetector()
        # ``with`` guarantees the handle is released even if feed() raises.
        with open(file_name, 'rb') as fp:
            for line_no, line in enumerate(fp, 1):
                # Cut the MS-Windows CR byte before feeding the detector.
                detector.feed(line.replace(b'\r', b''))
                # A non-positive cnt never matches, i.e. means "no limit".
                if detector.done or line_no == cnt:
                    break
        detector.close()
        encoding = detector.result['encoding']
        if encoding:
            encoding = encoding.upper()
        confidence = detector.result['confidence']
        if self._option('verbose'):
            print('{2}: Detected {0} with {1} confidence'.format(encoding, confidence, file_name))
        # Never claim confidence without an encoding: _convert cannot decode
        # with ``None``, whatever threshold was configured.
        confident = bool(encoding) and (confidence or 0.0) > self.confidence
        return encoding, confident

    def _convert(self, file_name, encoding, to_encoding):
        """Rewrite *file_name* from *encoding* to *to_encoding* in place.

        Line endings are normalised to ``\\n``. Nothing is written when both
        names denote the same codec, or when decoding/encoding fails (a
        short ``<file>:<ErrorName>`` message is printed instead).
        """
        if self._same_codec(encoding, to_encoding):
            return
        try:
            with open(file_name, 'rb') as fp:
                raw = fp.read()
            text = raw.decode(encoding, 'strict')
            text = text.replace('\r\n', '\n').replace('\r', '\n')
            contents = text.encode(to_encoding)
        except LookupError:
            print(file_name + ":LookupError")
            return
        except UnicodeDecodeError:
            print(file_name + ":UnicodeDecodeError")
            return
        except UnicodeEncodeError:
            print(file_name + ":UnicodeEncodeError")
            return
        if self._option('verbose'):
            print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))
        # The file is only opened for writing once the converted content is
        # fully built, so a failed conversion leaves the original untouched.
        with open(file_name, 'wb') as f:
            f.write(contents)

    def convert_file(self, file_name, to_encoding):
        """Convert *file_name* to *to_encoding* if detection is confident."""
        encoding, confidence = self._detect(file_name, self.max_detect_lines)
        if confidence:
            self._convert(file_name, encoding, to_encoding)

    def convert_dir(self, dir, to_encoding='UTF-8'):
        """Convert every file directly inside *dir* to *to_encoding*.

        Sub-directories are processed as well only when the ``handle_all``
        option is on. Paths ending in ``ConvertTool.py`` are skipped.
        """
        for fpathe, dirs, fs in os.walk(dir):
            # TODO: decide whether hidden directories should be skipped.
            for f in fs:
                path_str = os.path.join(fpathe, f)
                if path_str.endswith("ConvertTool.py"):
                    continue
                self.convert_file(path_str, to_encoding)
            # os.walk yields the top directory first; stopping here limits
            # the run to that directory unless recursion was requested.
            if not self._option('handle_all'):
                return
if __name__ == '__main__':
    # Command-line entry point: parse the options, then convert the files
    # found under --path to the --to encoding.
    parser = argparse.ArgumentParser(description="Convert your file to some other encodings")
    # One (flags, keyword settings) pair per supported option, registered in order.
    option_specs = (
        (('--verbose', '-v'),
         dict(action='store_true', help='verbose mode')),
        (('--path', '-p'),
         dict(action="store", dest="path", required=True, type=str, help="path")),
        (('--to', '-t'),
         dict(action="store", dest="to_encoding", default='UTF-8', type=str, help="to encoding")),
        (('--all', '-r'),
         dict(action='store_true', dest="handle_all", help='handler all file in the subdir')),
        (('--confidence', '-c'),
         dict(action="store", dest="confidence", default=0.95, type=float, help="confidence")),
    )
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)
    # ``args`` must stay a module-level name: ConvertHandler looks up the
    # verbose / handle_all switches on it while it works.
    args = parser.parse_args()
    handler = ConvertHandler(args.confidence)
    handler.convert_dir(args.path, args.to_encoding)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment