Skip to content

Instantly share code, notes, and snippets.

@utkarshkukreti
Created May 31, 2011 09:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save utkarshkukreti/1000245 to your computer and use it in GitHub Desktop.
Save utkarshkukreti/1000245 to your computer and use it in GitHub Desktop.
#! /usr/bin/env python
# vim: set fileencoding=utf-8 :
# (c) Uwe Kleine-König <ukleine@strlen.de>
# GPLv2
# Modified by Utkarsh Kukreti <utkarshkukreti@gmail.com> - 2011/05/31
import locale
import sys
# Read the whole input file (path given as the first command-line argument)
# as raw bytes.  Binary mode keeps the bytes untouched on every platform, and
# the ``with`` block guarantees the handle is closed once the data is read.
with open(sys.argv[1], 'rb') as f:
    data = f.read()
def len_utf8_char(data, start):
    """Return the byte length of the UTF-8 sequence starting at ``data[start]``.

    ``data`` is a byte string: a Python 2 ``str`` or a Python 3 ``bytes``.
    ``start`` must be a valid index into ``data``.

    Returns 1 for an ASCII byte, or 2 to 6 for a lead byte followed by the
    right number of continuation bytes (0x80-0xbf).  Returns -1 when the
    sequence is malformed: a continuation byte is missing or out of range,
    the sequence is cut off by the end of ``data``, or the lead byte cannot
    start a sequence (a stray continuation byte, 0xfe or 0xff).

    Only the bit patterns are checked; overlong encodings and surrogate
    code points are not rejected, and the historical 5- and 6-byte forms
    are still accepted.
    """
    def byte_value(item):
        # Indexing a Python 2 str yields a 1-char string, while indexing
        # Python 3 bytes yields an int; normalise both to an int.
        return item if isinstance(item, int) else ord(item)

    def check_cont(num):
        tail = data[start + 1:start + num]
        # Slicing silently stops at the end of data, so a short tail means
        # the sequence is truncated and must not be accepted.
        if len(tail) != num - 1:
            return -1
        if all(0x80 <= byte_value(c) <= 0xbf for c in tail):
            return num
        return -1

    lead = byte_value(data[start])
    if lead < 0x80:
        # ASCII char
        return 1
    elif lead & 0xe0 == 0xc0:
        return check_cont(2)
    elif lead & 0xf0 == 0xe0:
        return check_cont(3)
    elif lead & 0xf8 == 0xf0:
        return check_cont(4)
    elif lead & 0xfc == 0xf8:
        return check_cont(5)
    elif lead & 0xfe == 0xfc:
        return check_cont(6)
    # Stray continuation byte (0x80-0xbf) or 0xfe/0xff: not a valid lead byte.
    return -1
# Walk the data one UTF-8 sequence at a time, remembering the longest
# sequence length seen.  On the first invalid sequence, print a fallback
# guess and exit; otherwise report UTF-8 or plain ASCII at the end.
i = 0
maxl = 0
while i < len(data):
    l = len_utf8_char(data, i)
    # Treat a missing result (None) like -1 explicitly rather than relying
    # on Python 2's ordering of None below every integer.
    if l is None or l < 0:
        # Not valid UTF-8: report the locale's preferred encoding unless
        # that is itself UTF-8 or ASCII, in which case print ISO-8859-1.
        prefenc = locale.getpreferredencoding()
        if prefenc not in ('UTF-8', 'ANSI_X3.4-1968'):
            print(prefenc)
        else:
            print('ISO-8859-1')
        sys.exit(0)
    if maxl < l:
        maxl = l
    i += l
# A multi-byte sequence was seen only if maxl > 1; otherwise every byte
# was ASCII (or the data was empty).
if maxl > 1:
    print('UTF-8')
else:
    print('ANSI_X3.4-1968')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment