Skip to content

Instantly share code, notes, and snippets.

@pawelszydlo
Last active May 24, 2020 18:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pawelszydlo/936d9f2cf15d04ab80a0705d1a1bef93 to your computer and use it in GitHub Desktop.
Save pawelszydlo/936d9f2cf15d04ab80a0705d1a1bef93 to your computer and use it in GitHub Desktop.
Detect text file encoding and convert to UTF-8
#! /usr/bin/env python3
# Script to auto-detect file encoding and convert it to UTF-8.
# Uses cchardet which provides very good detection.
#
# Newest version can always be found at:
# https://gist.github.com/pawelszydlo/936d9f2cf15d04ab80a0705d1a1bef93
import cchardet
import os
import sys
def convert_encoding(file_name, new_encoding):
    """Convert the contents of file_name to new_encoding, in place.

    The current encoding is guessed by cchardet from the raw bytes of the
    file. If the guess already matches new_encoding (compared
    case-insensitively) the file is left untouched. Otherwise the bytes are
    decoded with the detected encoding, re-encoded, and written back over
    the original file.

    Args:
        file_name: Path of the file to convert.
        new_encoding: Name of the target encoding, e.g. 'UTF-8'.

    Returns:
        None. Progress and problems are reported on standard output.
    """
    if not os.path.isfile(file_name):
        print('"%s" is not a file.' % file_name)
        return

    # Skip files larger than 1MB.
    if os.path.getsize(file_name) > 1024 * 1024:
        print('"%s" is too big.' % file_name)
        return

    # Read through a context manager so the handle is closed right away
    # instead of whenever the garbage collector gets to it.
    with open(file_name, 'rb') as source:
        data = source.read()

    encoding = cchardet.detect(data)['encoding']
    if not encoding:
        print('Couldn\'t detect encoding for "%s".' % file_name)
        return

    print('Detected "%s" for file "%s" ...' % (encoding, file_name))
    if new_encoding.upper() == encoding.upper():
        print('... not doing anything.')
        return

    print('... converting to "%s".' % new_encoding)
    try:
        # errors='replace' swaps undecodable bytes for U+FFFD rather than
        # aborting. The conversion is done before the file is reopened for
        # writing, so a failure here never truncates the original.
        converted = data.decode(encoding, errors='replace').encode(new_encoding)
    except LookupError as error:
        # Python has no codec under one of the two names; skip this file
        # rather than crash the whole batch.
        print('... skipping: %s.' % error)
        return

    # Closing the handle explicitly guarantees the new contents are flushed.
    with open(file_name, 'wb') as target:
        target.write(converted)
# Script entry point: every command-line argument is treated as a file to
# convert to UTF-8 in place.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Pass filenames as parameters.')
        # A usage error must not report success to the calling shell.
        sys.exit(1)
    for file_name in sys.argv[1:]:
        convert_encoding(file_name, 'UTF-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment