Created
June 27, 2018 09:11
-
-
Save messyidea/9d2dd3ac644f9ed9347143389ecd1628 to your computer and use it in GitHub Desktop.
ConvertTool.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os | |
import sys | |
import codecs | |
import threading | |
import json | |
import time | |
import hashlib | |
import shutil | |
import argparse | |
from chardet.universaldetector import UniversalDetector | |
class ConvertHandler(object):
    """Detect the text encoding of files and rewrite them in another encoding.

    Files are converted in place, and every line ending is normalised to
    ``\\n`` as part of the conversion.

    Args:
        confidence: Minimum detector confidence (exclusive) required before a
            file is converted.
        verbose: Print detection/conversion details. ``None`` (default) defers
            to a module-level ``args.verbose`` when the file runs as a script,
            and to ``False`` otherwise.
        handle_all: Descend into sub-directories in ``convert_dir``. ``None``
            (default) defers to a module-level ``args.handle_all`` when the
            file runs as a script, and to ``False`` otherwise.
    """

    # Class-level fallbacks so instances created without __init__ still work.
    _verbose = None
    _handle_all = None

    def __init__(self, confidence=0.95, verbose=None, handle_all=None):
        self.confidence = confidence
        # Upper bound on the number of lines fed to the detector per file.
        self.max_detect_lines = 600
        self._verbose = verbose
        self._handle_all = handle_all

    def _option(self, name):
        """Return the boolean option *name* ('verbose' or 'handle_all').

        An explicit constructor value wins. Otherwise the module-level
        ``args`` namespace set by the command-line entry point is consulted,
        and ``False`` is used when that namespace does not exist (for example
        when the class is imported rather than run as a script).
        """
        explicit = getattr(self, '_' + name, None)
        if explicit is not None:
            return bool(explicit)
        return bool(getattr(globals().get('args'), name, False))

    @staticmethod
    def _same_codec(first, second):
        """Return True when both names refer to the same Python codec.

        The comparison goes through ``codecs.lookup`` so spelling variants
        such as ``UTF-8`` / ``utf-8`` / ``utf8`` are treated as equal. Unknown
        or invalid names yield False, leaving error reporting to the caller.
        """
        if first == second:
            return True
        try:
            return codecs.lookup(first).name == codecs.lookup(second).name
        except (LookupError, TypeError):
            return False

    def _detect(self, file_name, cnt):
        """Guess the encoding of *file_name* from at most *cnt* lines.

        Returns:
            A tuple ``(encoding, confident)``. ``encoding`` is the upper-cased
            detector result (``""`` for a missing or empty file, possibly
            ``None`` when nothing was detected). ``confident`` is True only
            when an encoding was found and its confidence exceeds
            ``self.confidence``.
        """
        # isfile (rather than exists) keeps directories and special files such
        # as FIFOs from being opened.
        if not file_name or not os.path.isfile(file_name) or os.path.getsize(file_name) == 0:
            return "", False
        detector = UniversalDetector()
        with open(file_name, 'rb') as fp:
            for line in fp:
                # cut MS-Windows CR code
                detector.feed(line.replace(b'\r', b''))
                cnt -= 1
                if detector.done or cnt == 0:
                    break
        detector.close()
        encoding = detector.result['encoding']
        if encoding:
            encoding = encoding.upper()
        confidence = detector.result['confidence'] or 0.0
        if self._option('verbose'):
            print('{2}: Detected {0} with {1} confidence'.format(encoding, confidence, file_name))
        # Never report confidence without an encoding, whatever the threshold.
        return encoding, bool(encoding) and confidence > self.confidence

    def _convert(self, file_name, encoding, to_encoding):
        """Rewrite *file_name* in place from *encoding* to *to_encoding*.

        Nothing is written when both names denote the same codec, or when the
        file cannot be decoded/encoded; in the failure case a short
        ``<file>:<ErrorName>`` message is printed and the file is left
        untouched. Line endings are normalised to ``\\n``.
        """
        if self._same_codec(encoding, to_encoding):
            return
        try:
            with open(file_name, 'rb') as fp:
                raw = fp.read()
            text = raw.decode(encoding, errors='strict')
            text = text.replace('\r\n', '\n').replace('\r', '\n')
            contents = text.encode(to_encoding)
        except LookupError:
            print(file_name + ":LookupError")
            return
        except UnicodeDecodeError:
            print(file_name + ":UnicodeDecodeError")
            return
        except UnicodeEncodeError:
            print(file_name + ":UnicodeEncodeError")
            return
        if self._option('verbose'):
            print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))
        with open(file_name, 'wb') as f:
            f.write(contents)

    def convert_file(self, file_name, to_encoding):
        """Convert one file, but only if its encoding was detected confidently."""
        encoding, confident = self._detect(file_name, self.max_detect_lines)
        if confident:
            self._convert(file_name, encoding, to_encoding)

    def convert_dir(self, dir, to_encoding='UTF-8'):
        """Convert the files found under *dir* to *to_encoding*.

        Only the files directly inside *dir* are processed unless the
        ``handle_all`` option is enabled, in which case the whole tree is
        walked. Files named ``ConvertTool.py`` found during the walk are
        skipped. When *dir* is itself a regular file, that file is converted.
        """
        if os.path.isfile(dir):
            self.convert_file(dir, to_encoding)
            return
        recurse = self._option('handle_all')
        for root, _subdirs, names in os.walk(dir):
            # Open question: is it necessary to skip some hidden directories?
            for name in names:
                # Compare the bare file name so unrelated files whose names
                # merely end in "ConvertTool.py" are still converted.
                if name == "ConvertTool.py":
                    continue
                self.convert_file(os.path.join(root, name), to_encoding)
            if not recurse:
                # os.walk yields the top directory first; stop after it.
                return
if __name__ == '__main__':
    # Command-line entry point: parse the options, then convert the files
    # found under --path to the --to encoding.
    parser = argparse.ArgumentParser(description="Convert your file to some other encodings")
    # Each entry is (option flags, keyword arguments for add_argument).
    option_specs = (
        (('--verbose', '-v'),
         dict(action='store_true', help='verbose mode')),
        (('--path', '-p'),
         dict(action="store", dest="path", required=True, type=str, help="path")),
        (('--to', '-t'),
         dict(action="store", dest="to_encoding", default='UTF-8', type=str, help="to encoding")),
        (('--all', '-r'),
         dict(action='store_true', dest="handle_all", help='handler all file in the subdir')),
        (('--confidence', '-c'),
         dict(action="store", dest="confidence", default=0.95, type=float, help="confidence")),
    )
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)
    # `args` is bound at module level on purpose: the ConvertHandler methods
    # look up the verbose / handle_all options through this global name.
    args = parser.parse_args()
    handler = ConvertHandler(args.confidence)
    handler.convert_dir(args.path, args.to_encoding)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment