Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BlaayLock/e496d4341a820cb28629a7d38757a3f3 to your computer and use it in GitHub Desktop.
Save BlaayLock/e496d4341a820cb28629a7d38757a3f3 to your computer and use it in GitHub Desktop.
Python: Convert Unicode to ASCII without errors, utf8 -> cp1251
#! python2
# //coding: utf-8
# coding=utf-8
# -*- coding: utf-8 -*-
# vim: set fileencoding=utf-8 :
# Demo 1 (Python 2): ASCII-only unicode text converted to cp1251 bytes.
# Calling .decode() on a unicode object first encodes it implicitly with the
# default codec (ASCII unless reconfigured), so this step only succeeds
# because u'1a' holds no non-ASCII characters.
a = u'1a'
a = a.decode('utf-8')
a = a.encode('cp1251')
print(a)

# Demo 2 (Python 2): text with a Cyrillic letter (U+0430), taken through an
# explicit UTF-8 round trip before being encoded to cp1251 bytes.
a = u'1\u0430'
a = a.encode('UTF-8')
a = a.decode('UTF-8')
a = a.encode('cp1251')
print(a)

#~ Do not confuse Unicode with UTF-8! From the point of view of Python 2.*,
#~ UTF-8 -- just like cp1251 -- is an array of bytes.
#~ I often trip up by thinking that UTF-8 is unicode. Also, to avoid confusion,
#~ I recommend declaring the source encoding at the top of the file with a
#~ magic comment.
#~ Details: py-my.ru/post/4bfb3c6a1d41c846bc00009b
# Demo 3 (Python 2): print ``msg`` as cp1251 bytes whatever form it arrives in.
msg = u'1\u0430'

# Plain truthiness replaces the original ``len(str(msg))`` test: in Python 2,
# str() on a unicode object encodes with the ASCII codec and raises
# UnicodeEncodeError for the Cyrillic character above.
if msg:
    if isinstance(msg, unicode):
        # Already decoded text: encode it straight to cp1251.
        print(msg.encode('cp1251'))
    elif get_codepage(msg) == 'UTF-8':
        # Byte string detected as UTF-8: decode it, then re-encode.
        # NOTE: get_codepage is defined further down in this file, so this
        # branch raises NameError if it is reached while the module is still
        # executing top to bottom.
        print(msg.decode('UTF-8').encode('cp1251'))
    else:
        # Any other byte string is printed unchanged.
        print(msg)
# Charset labels known to get_codepage(), which seeds one score counter per
# key of this table and returns one of these keys (or '') as its verdict.
# The values look like charset/codec names for each label; no code visible in
# this file reads them -- TODO confirm their intended use.
encodings = {
'UTF-8': 'utf-8',
'CP1251': 'windows-1251',
'KOI8-R': 'koi8-r',
'IBM866': 'ibm866',
'ISO-8859-5': 'iso-8859-5',
'MAC': 'mac',
}
def get_codepage(str=None):
    """Guess which Cyrillic encoding a piece of encoded text uses.

    Every character/byte in the high half of the single-byte range adds
    points to each candidate encoding in which that value is a Russian
    letter.  Lowercase letters weigh more than uppercase ones, and valid
    two-byte UTF-8 sequences weigh most.  The best-scoring candidate wins.

    Args:
        str: The text to inspect: a Python 2 byte string, or any iterable
            yielding one-character strings or integer byte values (so
            ``bytes`` and ``bytearray`` work too).  The parameter name
            shadows the builtin; it is kept for caller compatibility.

    Returns:
        One of 'UTF-8', 'CP1251', 'KOI8-R', 'IBM866', 'ISO-8859-5', 'MAC'
        (the keys of the module-level ``encodings`` table), or '' when the
        input is None, empty, or contains nothing that scores.
    """
    # Candidates in tie-break order: on equal scores the earlier entry wins.
    # A fixed tuple keeps the verdict independent of dict iteration order.
    candidates = ('UTF-8', 'CP1251', 'KOI8-R', 'IBM866', 'ISO-8859-5', 'MAC')

    upper_weight = 1
    lower_weight = 3
    utf_upper_weight = 5 * 2
    utf_lower_weight = 7 * 2

    if not str:
        return ''

    scores = dict.fromkeys(candidates, 0)
    lead = 0  # previous scored value; used to recognise UTF-8 pairs
    for item in str:
        code = item if isinstance(item, int) else ord(item)

        # Only the high half of the single-byte range carries any signal.
        # NOTE: skipped characters do not reset ``lead``, so a UTF-8 lead
        # byte still pairs with the next high byte across ASCII text.
        if code < 128 or code > 256:
            continue

        # UTF-8: lead 0xD0 + 0x90..0xAF (uppercase) or 0x81 (uppercase yo).
        if lead == 0xD0 and (0x90 <= code <= 0xAF or code == 0x81):
            scores['UTF-8'] += utf_upper_weight
        # UTF-8: lead 0xD0 + 0xB0..0xBF, or lead 0xD1 + 0x80..0x8F / 0x91
        # (lowercase yo, U+0451, is D1 91 -- not D0 91).
        if (lead == 0xD0 and 0xB0 <= code <= 0xBF) \
                or (lead == 0xD1 and (0x80 <= code <= 0x8F or code == 0x91)):
            scores['UTF-8'] += utf_lower_weight

        # CP1251: lowercase 0xE0..0xFF and 0xB8; uppercase 0xC0..0xDF and 0xA8.
        if 0xE0 <= code <= 0xFF or code == 0xB8:
            scores['CP1251'] += lower_weight
        if 0xC0 <= code <= 0xDF or code == 0xA8:
            scores['CP1251'] += upper_weight

        # KOI8-R: lowercase 0xC0..0xDF and 0xA3; uppercase 0xE0..0xFF and 0xB3.
        if 0xC0 <= code <= 0xDF or code == 0xA3:
            scores['KOI8-R'] += lower_weight
        if 0xE0 <= code <= 0xFF or code == 0xB3:
            scores['KOI8-R'] += upper_weight

        # IBM866: lowercase 0xA0..0xAF, 0xE0..0xEF and 0xF1;
        # uppercase 0x80..0x9F and 0xF0.
        if 0xA0 <= code <= 0xAF or 0xE0 <= code <= 0xEF or code == 0xF1:
            scores['IBM866'] += lower_weight
        if 0x80 <= code <= 0x9F or code == 0xF0:
            scores['IBM866'] += upper_weight

        # ISO-8859-5: lowercase 0xD0..0xEF and 0xF1;
        # uppercase 0xB0..0xCF and 0xA1.
        if 0xD0 <= code <= 0xEF or code == 0xF1:
            scores['ISO-8859-5'] += lower_weight
        if 0xB0 <= code <= 0xCF or code == 0xA1:
            scores['ISO-8859-5'] += upper_weight

        # MAC: lowercase 0xDE..0xFE; uppercase 0x80..0x9F.
        if 0xDE <= code <= 0xFE:
            scores['MAC'] += lower_weight
        if 0x80 <= code <= 0x9F:
            scores['MAC'] += upper_weight

        lead = code

    # Strict ">" keeps the earliest candidate on ties and '' when all are 0.
    best_name = ''
    best_score = 0
    for name in candidates:
        if scores[name] > best_score:
            best_score = scores[name]
            best_name = name
    return best_name
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment