Skip to content

Instantly share code, notes, and snippets.

@messyidea
Last active June 27, 2018 08:25
Show Gist options
  • Save messyidea/6baf736b2d2dd05ec590c1f3e69bb995 to your computer and use it in GitHub Desktop.
Save messyidea/6baf736b2d2dd05ec590c1f3e69bb995 to your computer and use it in GitHub Desktop.
ConvertToUtf8
# -*- coding: utf-8 -*-
import os
import sys
import codecs
import threading
import json
import time
import hashlib
import shutil
from chardet.universaldetector import UniversalDetector
class ConvertHandler(object):
    """Detect the text encoding of files and rewrite them in another encoding.

    Detection is delegated to chardet's ``UniversalDetector``. A file is only
    converted when the detector's reported confidence exceeds
    ``self.confidence``.
    """

    def __init__(self, confidence=0.95, max_detect_lines=600):
        """Create a handler.

        Args:
            confidence: Detector confidence that must be exceeded before a
                file is converted.
            max_detect_lines: Maximum number of lines fed to the detector
                for each file.
        """
        self.confidence = confidence
        self.max_detect_lines = max_detect_lines

    @staticmethod
    def _canonical_name(encoding):
        """Return the codec registry's name for *encoding*.

        Falls back to the value itself when it is not a known codec name, so
        that comparing two canonical names treats spellings such as
        ``'UTF-8'``, ``'utf-8'`` and ``'utf8'`` as the same encoding.
        """
        try:
            return codecs.lookup(encoding).name
        except (LookupError, TypeError):
            return encoding

    def _detect(self, file_name, cnt):
        """Guess the encoding of *file_name*.

        Args:
            file_name: Path of the file to inspect.
            cnt: Maximum number of lines to feed to the detector. The counter
                only stops the loop when it reaches exactly zero, so a value
                of zero or less means "no limit".

        Returns:
            A ``(encoding, confident)`` tuple. ``encoding`` is the upper-cased
            name reported by the detector (or whatever falsy value it
            reported), and ``confident`` tells whether the reported
            confidence exceeds ``self.confidence``. A missing or empty file
            yields ``("", False)``.
        """
        if (not file_name or not os.path.exists(file_name)
                or os.path.getsize(file_name) == 0):
            return "", False

        detector = UniversalDetector()
        # The context manager closes the file even if the detector raises.
        with open(file_name, 'rb') as fp:
            for line in fp:
                # Cut MS-Windows CR code before feeding the detector.
                detector.feed(line.replace(b'\r', b''))
                cnt -= 1
                if detector.done or cnt == 0:
                    break
        detector.close()

        result = detector.result
        encoding = result['encoding']
        if encoding:
            encoding = encoding.upper()
        return encoding, result['confidence'] > self.confidence

    def _convert(self, file_name, encoding, to_encoding):
        """Rewrite *file_name* in place from *encoding* to *to_encoding*.

        Line endings are normalised to ``\\n`` as part of the rewrite. The
        file is left untouched when both names refer to the same codec, when
        *encoding* is empty, or when decoding/encoding fails (a short message
        is printed in the failure cases).

        Args:
            file_name: Path of the file to rewrite.
            encoding: Current encoding of the file.
            to_encoding: Encoding to write the file in.
        """
        if not encoding:
            return
        # Compare codec identities rather than raw strings: the detected name
        # is upper-cased, whereas the caller may spell the target differently.
        if self._canonical_name(encoding) == self._canonical_name(to_encoding):
            return

        try:
            with open(file_name, 'rb') as fp:
                raw = fp.read()
            text = raw.decode(encoding, 'strict')
            text = text.replace('\r\n', '\n').replace('\r', '\n')
            contents = text.encode(to_encoding)
        except LookupError:
            print("LookupError")
            return
        except UnicodeDecodeError:
            print("UnicodeDecodeError")
            return
        except UnicodeEncodeError:
            print(file_name + ":UnicodeEncodeError")
            return

        print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))
        with open(file_name, 'wb') as f:
            f.write(contents)

    def convert_file(self, file_name, to_encoding):
        """Convert a single file if its encoding is detected confidently.

        Args:
            file_name: Path of the file to convert.
            to_encoding: Encoding to write the file in.
        """
        encoding, confident = self._detect(file_name, self.max_detect_lines)
        if confident:
            self._convert(file_name, encoding, to_encoding)

    def convert_dir(self, dir, to_encoding='UTF-8'):
        """Convert every file found under *dir*, recursively.

        Files whose path ends with ``ConvertToUTF8.py`` are skipped
        (presumably so the script does not rewrite itself).

        Args:
            dir: Root directory to walk.
            to_encoding: Encoding to write the files in.
        """
        for root, _dirs, files in os.walk(dir):
            # Open question from the original author: is it necessary to
            # skip some hidden directories? Currently none are skipped.
            for name in files:
                path = os.path.join(root, name)
                if path.endswith("ConvertToUTF8.py"):
                    continue
                self.convert_file(path, to_encoding)
if __name__ == '__main__':
    # Run as a script: convert every file under the current working
    # directory to UTF-8.
    ConvertHandler().convert_dir(".", "UTF-8")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment