Skip to content

Instantly share code, notes, and snippets.

@joshuashaffer
Created March 8, 2016 02:32
Show Gist options
  • Save joshuashaffer/9493dc124024de6afe7e to your computer and use it in GitHub Desktop.
Save joshuashaffer/9493dc124024de6afe7e to your computer and use it in GitHub Desktop.
Figure out all possible alternative encodings for a unicode string.
# coding=utf-8
import pkgutil
import encodings
# Codecs deliberately left out of the search: byte-to-byte transforms
# (bz2/uu/base64/hex/zlib), the escape codecs, the UTF-16/UTF-32 family
# (multibyte unicode is considered unlikely), and a few other improbable ones.
unlikely_codecs = {
    "bz2_codec",
    "uu_codec",
    "base64_codec",
    "hex_codec",
    "zlib_codec",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_le",
    "utf_16_be",
    "utf_32",
    "utf_32_be",
    "utf_32_le",
    "raw_unicode_escape",
    "punycode",
    "palmos",
    "idna",
}

# Names of every plain (non-package) module shipped inside the ``encodings``
# package, with the unlikely codecs above already filtered out.
found = {
    module_name
    for _finder, module_name, is_package in pkgutil.iter_modules(encodings.__path__)
    if not is_package and module_name not in unlikely_codecs
}
def find_unique_encodings(in_word, candidates=None):
    r"""
    Return all possible alternative encodings for a given string.

    Every candidate codec is tried in turn. Codecs that cannot encode the
    word (or that cannot be looked up at all) are skipped, and identical
    results produced by different codecs collapse into one set member.

    :param in_word: text string to encode
    :param candidates: optional iterable of codec names to try; when omitted,
        the module-level ``found`` set (all codecs minus ``unlikely_codecs``)
        is used
    :return: set of the distinct values returned by ``in_word.encode(codec)``
    :raises AttributeError: if ``in_word`` has no ``encode`` method

    Example session (output appears to be from Python 2, where the results
    are ``str``; the exact members depend on the interpreter's codec set):

    In [6]: find_unique_encodings(u"Limón")
    Out[6]:
    {'Lim\x1b$(D+Q\x1b(Bn',
     'Lim\x1b$(O)i\x1b(Bn',
     'Lim\x1b$(Q)i\x1b(Bn',
     'Lim+APM-n',
     'Lim~{(.~}n',
     'Lim\x85\x89n',
     'Lim\x88un',
     'Lim\x8f\xab\xd1n',
     'Lim\x97n',
     'Lim\xa2n',
     'Lim\xa8\xaen',
     'Lim\xa9\xe9n',
     'Lim\xc3\xb3n',
     'Lim\xc6n',
     'Lim\xf3n',
     'Yvz\xf3a',
     '\xd3\x89\x94\xce\x95',
     '\xef\xbb\xbfLim\xc3\xb3n'}
    """
    if candidates is None:
        candidates = found

    out_encodings = set()
    for encoding in candidates:
        try:
            encoded = in_word.encode(encoding)
        except (LookupError, TypeError, ValueError):
            # Best-effort: skip codecs that do not apply to this word.
            #  - LookupError: unknown name, or not usable as a text encoding
            #  - ValueError: includes UnicodeError (word not representable)
            #  - TypeError: presumably a codec that rejects text input
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        out_encodings.add(encoded)
    return out_encodings
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment