messyidea/ConvertToUtf8.py

## ConvertToUtf8.py
# -*- coding: utf-8 -*-
import os
import sys
import codecs
import threading
import json
import time
import hashlib
import shutil
from chardet.universaldetector import UniversalDetector

class ConvertHandler(object):
	"""Detect the encoding of files and rewrite them in a target encoding.

	Detection is delegated to chardet's ``UniversalDetector``; a file is only
	rewritten when the detector's confidence exceeds ``self.confidence``.
	"""

	def __init__(self):
		# Confidence a detection result must exceed before it is trusted.
		self.confidence = 0.95
		# Maximum number of lines fed to the detector for a single file.
		self.max_detect_lines = 600

	def _detect(self, file_name, cnt):
		"""Guess the encoding of ``file_name``.

		Args:
			file_name: Path of the file to inspect.
			cnt: Maximum number of lines to feed to the detector. The counter
				is decremented before it is compared with zero, so a
				non-positive value places no limit on the lines read.

		Returns:
			A ``(encoding, trusted)`` tuple. ``encoding`` is the upper-cased
			name reported by the detector (or ``None`` when it reports none)
			and ``trusted`` tells whether the reported confidence exceeds
			``self.confidence``. Unnamed, missing or empty files yield
			``("", False)``.
		"""
		if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
			return "", False

		detector = UniversalDetector()
		# The context manager closes the file even if feeding the detector fails.
		with open(file_name, 'rb') as fp:
			for line in fp:
				# cut MS-Windows CR code
				detector.feed(line.replace(b'\r', b''))
				cnt -= 1
				if detector.done or cnt == 0:
					break
		detector.close()

		result = detector.result
		encoding = result['encoding']
		if encoding:
			encoding = encoding.upper()
		# Treat a missing confidence as zero so the comparison cannot fail.
		confidence = result['confidence'] or 0.0

		# Use the configurable threshold instead of a duplicated literal.
		return encoding, confidence > self.confidence

	def _convert(self, file_name, encoding, to_encoding):
		"""Rewrite ``file_name`` in place from ``encoding`` to ``to_encoding``.

		Line endings are normalised to ``\\n`` while converting. Nothing is
		written when the source encoding is unknown, when both encodings
		name the same codec (compared case-insensitively), or when decoding
		or encoding fails; failures are reported on standard output.

		Args:
			file_name: Path of the file to rewrite.
			encoding: Name of the encoding the file is currently stored in.
			to_encoding: Name of the encoding to write the file in.
		"""
		if not encoding:
			# No usable detection result: leave the file untouched.
			return
		if encoding.upper() == to_encoding.upper():
			# Same encoding, nothing to do. Detected names are upper-cased,
			# so the comparison must not depend on the caller's spelling.
			return

		try:
			with open(file_name, 'rb') as fp:
				contents = fp.read().decode(encoding, 'strict')
			contents = contents.replace('\r\n', '\n').replace('\r', '\n')
			contents = contents.encode(to_encoding)
		except LookupError:
			print("LookupError")
			return
		except UnicodeDecodeError:
			print("UnicodeDecodeError")
			return
		except UnicodeEncodeError:
			print(file_name + ":UnicodeEncodeError")
			return

		print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))

		# Only reached once the whole content was converted successfully.
		with open(file_name, 'wb') as f:
			f.write(contents)

	def convert_file(self, file_name, to_encoding):
		"""Convert one file to ``to_encoding`` if its encoding is detected reliably.

		Args:
			file_name: Path of the file to convert.
			to_encoding: Name of the encoding to write the file in.
		"""
		encoding, confidence = self._detect(file_name, self.max_detect_lines)
		if confidence:
			self._convert(file_name, encoding, to_encoding)

	def convert_dir(self, dir, to_encoding='UTF-8'):
		"""Convert every file found below ``dir`` to ``to_encoding``.

		Files whose path ends with this script's name (in any letter case)
		are skipped.

		Args:
			dir: Root directory that is walked recursively.
			to_encoding: Name of the encoding to write the files in.
		"""
		for fpathe, dirs, fs in os.walk(dir):
			# TODO: is it necessary to skip certain hidden directories?
			for f in fs:
				path_str = os.path.join(fpathe, f)
				# Case-insensitive so both "ConvertToUTF8.py" and
				# "ConvertToUtf8.py" are recognised as the script itself.
				if path_str.lower().endswith("converttoutf8.py"):
					continue
				self.convert_file(path_str, to_encoding)


if __name__ == '__main__':
	# Script entry point: convert every file below the current directory to UTF-8.
	ConvertHandler().convert_dir(".", to_encoding="UTF-8")
	# -- coding: utf-8 --
	import os
	import sys
	import codecs
	import threading
	import json
	import time
	import hashlib
	import shutil
	from chardet.universaldetector import UniversalDetector

	class ConvertHandler(object):
		"""Detect the encoding of files and rewrite them in a target encoding.

		Detection is delegated to chardet's ``UniversalDetector``; a file is only
		rewritten when the detector's confidence exceeds ``self.confidence``.
		"""

		def __init__(self):
			# Confidence a detection result must exceed before it is trusted.
			self.confidence = 0.95
			# Maximum number of lines fed to the detector for a single file.
			self.max_detect_lines = 600

		def _detect(self, file_name, cnt):
			"""Guess the encoding of ``file_name``.

			Args:
				file_name: Path of the file to inspect.
				cnt: Maximum number of lines to feed to the detector. The counter
					is decremented before it is compared with zero, so a
					non-positive value places no limit on the lines read.

			Returns:
				A ``(encoding, trusted)`` tuple. ``encoding`` is the upper-cased
				name reported by the detector (or ``None`` when it reports none)
				and ``trusted`` tells whether the reported confidence exceeds
				``self.confidence``. Unnamed, missing or empty files yield
				``("", False)``.
			"""
			if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
				return "", False

			detector = UniversalDetector()
			# The context manager closes the file even if feeding the detector fails.
			with open(file_name, 'rb') as fp:
				for line in fp:
					# cut MS-Windows CR code
					detector.feed(line.replace(b'\r', b''))
					cnt -= 1
					if detector.done or cnt == 0:
						break
			detector.close()

			result = detector.result
			encoding = result['encoding']
			if encoding:
				encoding = encoding.upper()
			# Treat a missing confidence as zero so the comparison cannot fail.
			confidence = result['confidence'] or 0.0

			# Use the configurable threshold instead of a duplicated literal.
			return encoding, confidence > self.confidence

		def _convert(self, file_name, encoding, to_encoding):
			"""Rewrite ``file_name`` in place from ``encoding`` to ``to_encoding``.

			Line endings are normalised to ``\\n`` while converting. Nothing is
			written when the source encoding is unknown, when both encodings
			name the same codec (compared case-insensitively), or when decoding
			or encoding fails; failures are reported on standard output.

			Args:
				file_name: Path of the file to rewrite.
				encoding: Name of the encoding the file is currently stored in.
				to_encoding: Name of the encoding to write the file in.
			"""
			if not encoding:
				# No usable detection result: leave the file untouched.
				return
			if encoding.upper() == to_encoding.upper():
				# Same encoding, nothing to do. Detected names are upper-cased,
				# so the comparison must not depend on the caller's spelling.
				return

			try:
				with open(file_name, 'rb') as fp:
					contents = fp.read().decode(encoding, 'strict')
				contents = contents.replace('\r\n', '\n').replace('\r', '\n')
				contents = contents.encode(to_encoding)
			except LookupError:
				print("LookupError")
				return
			except UnicodeDecodeError:
				print("UnicodeDecodeError")
				return
			except UnicodeEncodeError:
				print(file_name + ":UnicodeEncodeError")
				return

			print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))

			# Only reached once the whole content was converted successfully.
			with open(file_name, 'wb') as f:
				f.write(contents)

		def convert_file(self, file_name, to_encoding):
			"""Convert one file to ``to_encoding`` if its encoding is detected reliably.

			Args:
				file_name: Path of the file to convert.
				to_encoding: Name of the encoding to write the file in.
			"""
			encoding, confidence = self._detect(file_name, self.max_detect_lines)
			if confidence:
				self._convert(file_name, encoding, to_encoding)

		def convert_dir(self, dir, to_encoding='UTF-8'):
			"""Convert every file found below ``dir`` to ``to_encoding``.

			Files whose path ends with this script's name (in any letter case)
			are skipped.

			Args:
				dir: Root directory that is walked recursively.
				to_encoding: Name of the encoding to write the files in.
			"""
			for fpathe, dirs, fs in os.walk(dir):
				# TODO: is it necessary to skip certain hidden directories?
				for f in fs:
					path_str = os.path.join(fpathe, f)
					# Case-insensitive so both "ConvertToUTF8.py" and
					# "ConvertToUtf8.py" are recognised as the script itself.
					if path_str.lower().endswith("converttoutf8.py"):
						continue
					self.convert_file(path_str, to_encoding)



	if __name__ == '__main__':
		# Script entry point: convert every file below the current directory to UTF-8.
		handler = ConvertHandler()
		handler.convert_dir(".", "UTF-8")