Last active
June 27, 2018 08:25
-
-
Save messyidea/6baf736b2d2dd05ec590c1f3e69bb995 to your computer and use it in GitHub Desktop.
ConvertToUtf8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os | |
import sys | |
import codecs | |
import threading | |
import json | |
import time | |
import hashlib | |
import shutil | |
from chardet.universaldetector import UniversalDetector | |
class ConvertHandler(object):
    """Detect the encoding of files and rewrite them in a target encoding.

    Detection is delegated to chardet's ``UniversalDetector``. A file is only
    converted when the detector's confidence exceeds ``self.confidence``.
    Conversion also normalizes line endings ("\\r\\n" and "\\r" become "\\n").
    """

    def __init__(self):
        # Confidence a detection result must exceed before a file is converted.
        self.confidence = 0.95
        # Line budget handed to _detect() for each file by convert_file().
        self.max_detect_lines = 600

    def _detect(self, file_name, cnt):
        """Guess the encoding of ``file_name``.

        Args:
            file_name: Path of the file to inspect.
            cnt: Maximum number of lines to feed to the detector. The counter
                is decremented per line and detection stops when it reaches
                exactly 0, so a non-positive value means "no line limit".

        Returns:
            A ``(encoding, confident)`` tuple. ``encoding`` is the upper-cased
            name reported by the detector, or ``""`` when the file is missing,
            empty, or no encoding was found. ``confident`` is True only when
            an encoding was found and its confidence exceeds
            ``self.confidence``.
        """
        if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
            return "", False

        detector = UniversalDetector()
        # The context manager closes the file even if feeding raises.
        with open(file_name, 'rb') as fp:
            for line in fp:
                # cut MS-Windows CR code
                detector.feed(line.replace(b'\r', b''))
                cnt -= 1
                if detector.done or cnt == 0:
                    break
        detector.close()

        encoding = detector.result['encoding']
        if not encoding:
            # Same "nothing usable" shape as the early return above.
            return "", False
        # Treat a missing confidence as 0 so the comparison never sees None.
        confidence = detector.result['confidence'] or 0.0
        return encoding.upper(), confidence > self.confidence

    def _convert(self, file_name, encoding, to_encoding):
        """Rewrite ``file_name`` in place from ``encoding`` to ``to_encoding``.

        Nothing is written when either encoding is empty, when both names
        refer to the same codec, when a codec is unknown, when the content
        cannot be decoded/encoded, or when the converted bytes are identical
        to what is already on disk.

        Args:
            file_name: Path of the file to convert.
            encoding: Name of the encoding the file is currently stored in.
            to_encoding: Name of the encoding to store the file in.
        """
        if not encoding or not to_encoding:
            return
        if encoding == to_encoding:
            # print("same encoding, ignore")
            return

        try:
            # Compare canonical codec names so that spellings such as
            # "UTF-8", "utf-8" and "utf8" are recognized as the same codec.
            if codecs.lookup(encoding).name == codecs.lookup(to_encoding).name:
                return
            with open(file_name, 'rb') as fp:
                raw = fp.read()
            contents = raw.decode(encoding, 'strict')
            contents = contents.replace('\r\n', '\n').replace('\r', '\n')
            converted = contents.encode(to_encoding)
        except LookupError:
            print("LookupError")
            return
        except UnicodeDecodeError:
            print("UnicodeDecodeError")
            return
        except UnicodeEncodeError:
            print(file_name + ":UnicodeEncodeError")
            return

        if converted == raw:
            # The bytes on disk are already the desired result; do not touch
            # the file.
            return

        print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))
        with open(file_name, 'wb') as f:
            f.write(converted)

    def convert_file(self, file_name, to_encoding):
        """Convert one file to ``to_encoding`` if its encoding is detected
        with sufficient confidence; otherwise leave it untouched."""
        encoding, confidence = self._detect(file_name, self.max_detect_lines)
        if confidence:
            self._convert(file_name, encoding, to_encoding)

    def convert_dir(self, dir, to_encoding='UTF-8'):
        """Walk ``dir`` recursively and convert every file found in it.

        Files whose path ends with "ConvertToUTF8.py" are skipped.

        Args:
            dir: Root directory to walk.
            to_encoding: Target encoding name, "UTF-8" by default.
        """
        for fpathe, dirs, fs in os.walk(dir):
            # Is it necessary to skip some hidden directories?
            for f in fs:
                path_str = os.path.join(fpathe, f)
                if path_str.endswith("ConvertToUTF8.py"):
                    continue
                self.convert_file(path_str, to_encoding)
if __name__ == '__main__':
    # Script entry point: convert every file under the current working
    # directory to UTF-8.
    ConvertHandler().convert_dir(".", "UTF-8")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment