Skip to content

Instantly share code, notes, and snippets.

@msoxzw
Last active December 13, 2019 08:06
Show Gist options
  • Save msoxzw/705cc161729ff46208543c45af738fdf to your computer and use it in GitHub Desktop.
Save msoxzw/705cc161729ff46208543c45af738fdf to your computer and use it in GitHub Desktop.
Automatically convert any character encoding to UTF-8 and line endings to '\n'
import glob
import sys
from concurrent.futures import ProcessPoolExecutor
from itertools import chain
from cchardet import UniversalDetector
# Single module-level detector instance; detect() resets it before each file
# it examines, so the same object is reused instead of building a new one.
detector = UniversalDetector()
def detect(file):
    """Guess the character encoding of an open binary file.

    The module-level ``detector`` is reset, then fed the lines of *file*
    one at a time until it reports that it is done or the file runs out.

    Args:
        file: An iterable of ``bytes`` chunks, e.g. a file opened in
            binary mode (iterated line by line).

    Returns:
        The detector's ``result`` attribute; the caller in this file reads
        its ``'encoding'`` key.
    """
    detector.reset()
    # Leaving the ``with`` block is expected to finalize the detector so that
    # ``result`` is complete -- NOTE(review): confirm against cchardet.
    with detector:
        chunks = iter(file)
        while not detector.done:
            chunk = next(chunks, None)
            if chunk is None:
                # Input exhausted before the detector became confident.
                break
            detector.feed(chunk)
    return detector.result
def main(file):
    """Rewrite *file* in place as UTF-8 with ``'\\n'`` line endings.

    The file is first opened in binary mode to detect its encoding. If an
    encoding is found, the whole file is decoded with it (text mode with
    the default newline handling, which translates line endings to
    ``'\\n'`` on read) and written back as UTF-8 with ``newline=''`` so the
    ``'\\n'`` characters are stored untranslated.

    Args:
        file: Path of the file to convert.

    Returns:
        None. The file is silently skipped when it does not exist, is a
        directory, cannot be opened for reading and writing, or when no
        encoding could be detected.
    """
    try:
        # Opened with 'rb+' rather than 'rb', presumably so that files we
        # are not allowed to write fail here, before any rewrite starts.
        with open(file, 'rb+') as f:
            encoding = detect(f)['encoding']
    except (FileNotFoundError, IsADirectoryError, PermissionError):
        # Glob patterns routinely match directories; on POSIX opening one
        # raises IsADirectoryError (Windows reports PermissionError), so
        # treat it as "nothing to convert" like the other cases.
        return

    if not encoding:
        # Detection failed; leave the file untouched.
        return

    # Read everything before reopening for writing: mode 'w' truncates.
    with open(file, encoding=encoding) as f:
        data = f.read()
    with open(file, 'w', encoding='utf-8', newline='') as f:
        f.write(data)
if __name__ == '__main__':
    # Each command-line argument is a glob pattern; expand them all.
    matches = chain.from_iterable(map(glob.glob, sys.argv[1:]))
    # A path matched by more than one pattern must be converted only once:
    # otherwise two workers could rewrite the same file concurrently and one
    # might read it while the other has it truncated. dict.fromkeys drops
    # duplicates while keeping first-seen order.
    files = dict.fromkeys(matches)
    with ProcessPoolExecutor() as executor:
        # The result iterator is intentionally not consumed, so an exception
        # raised inside a worker is never re-raised here; leaving the
        # ``with`` block waits for all submitted conversions to finish.
        executor.map(main, files)
@msoxzw
Copy link
Author

msoxzw commented Dec 13, 2019

Considerably improve performance

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment