Skip to content
Create a gist now

Instantly share code, notes, and snippets.

Unicode to ASCII / UTF-8 converter for Python dicts, lists, strings and nested combinations of dicts, lists and strings
def asciify(data):
    """Asciify strings, lists and dicts, and nested versions of same.

    DESCRIPTION
        The JSON spec (RFC 4627, section 3) says "JSON text SHALL be
        encoded in Unicode".  For apps that don't use unicode, this
        function walks through all levels of a JSON data structure and
        converts each text item to ASCII: accented letters are replaced
        by their unaccented approximation (u'Nestl\\xe9' becomes 'Nestle')
        and characters with no ASCII approximation are dropped.
        Adapted from an earlier snippet; the link to the original was not
        preserved.

        Can be used for any nesting of strings / lists / dicts, e.g. a
        list of dicts, a dict in which values are lists of strings etc.
        See LIMITATIONS.

        Written for Python 2, where ``unicode`` and ``str`` are distinct
        types.  On Python 3, where ``unicode`` does not exist, ``str`` is
        treated as the text type and the result is still a native ``str``.

    PARAMETERS
        data    A string, unicode, list or dict, or nested versions of
                the same types.  Typically the string output from
                json.dumps() or the dict resulting from json.load() or
                json.loads().

    RETURNS
        An object of the same kind as the input (string, list or dict)
        in which every unicode key, value and item has been converted to
        an ASCII-only native ``str`` (which is also valid UTF-8).  Lists
        and dicts are rebuilt; the input container is not modified.
        Elements that are not text, list or dict are carried over as-is.

    RAISES
        TypeError   If the top-level input is not a dict, list, str or
                    unicode.

    USAGE
        There are several equivalent ways to use this function.

        (1) asciify string version of data structure before creating dict:

            s = json.dumps(x)
            d = json.loads(asciify(s))

        (2) create dict from string version of data structure, then
            asciify:

            s = json.dumps(x)
            d = json.loads(s)
            d = asciify(d)

        (3) asciify as the dict is being created via object hook:

            s = json.dumps(x)
            d = json.loads(s, object_hook=asciify)

        Asciifying the string first (approach (1) above) is probably the
        best approach since the input is a flat string and there's no
        possibility of the depth traversal stopping due to an unknown
        type.  See LIMITATIONS.
        NOTE(review): as the EXAMPLES below show, json.loads() on
        Python 2 returns unicode keys and values even for an ASCII input
        string, so confirm approach (1) yields the types you need.

    EXAMPLES
        (Recorded in a Python 2 session; dict ordering may differ.)

        >>> import json
        >>> s1 = 'ASCII string'
        >>> type(s1)
        <type 'str'>
        >>> s1 = asciify(s1)
        >>> type(s1)
        <type 'str'>
        >>> s2 = u'Unicode string'
        >>> type(s2)
        <type 'unicode'>
        >>> s2 = asciify(s2)
        >>> type(s2)
        <type 'str'>
        >>> s3 = 'Nestl'+unichr(0xe9)
        >>> print asciify(s3)
        Nestle
        >>> asciify(['a','b','c'])
        ['a', 'b', 'c']
        >>> asciify([u'a',u'b',u'c'])
        ['a', 'b', 'c']
        >>> asciify({'a':'aa','b':'bb','c':'cc'})
        {'a': 'aa', 'c': 'cc', 'b': 'bb'}
        >>> asciify({u'a':'aa','b':u'bb',u'c':u'cc'})
        {'a': 'aa', 'c': 'cc', 'b': 'bb'}
        >>> d = dict(a='a1',b='b2',c=dict(d='d3',e=['e4','e5','e6'],f=dict(g='g7')),h=[8,9,10])
        >>> print d
        {'a': 'a1', 'h': [8, 9, 10], 'c': {'e': ['e4', 'e5', 'e6'], 'd': 'd3', 'f': {'g': 'g7'}}, 'b': 'b2'}
        >>> print type(d)
        <type 'dict'>
        >>> asciistr = json.dumps(d)
        >>> print asciistr
        {"a": "a1", "h": [8, 9, 10], "c": {"e": ["e4", "e5", "e6"], "d": "d3", "f": {"g": "g7"}}, "b": "b2"}
        >>> print type(asciistr)
        <type 'str'>
        >>> unidict = json.loads(asciistr)
        >>> print unidict
        {u'a': u'a1', u'h': [8, 9, 10], u'c': {u'e': [u'e4', u'e5', u'e6'], u'd': u'd3', u'f': {u'g': u'g7'}}, u'b': u'b2'}
        >>> print type(unidict)
        <type 'dict'>
        >>> unidict == d
        True
        >>> asciidict1 = asciify(unidict)
        >>> print asciidict1
        {'a': 'a1', 'h': [8, 9, 10], 'c': {'e': ['e4', 'e5', 'e6'], 'd': 'd3', 'f': {'g': 'g7'}}, 'b': 'b2'}
        >>> print type(asciidict1)
        <type 'dict'>
        >>> asciidict1 == d
        True
        >>> asciidict2 = json.loads(asciistr, object_hook=asciify)
        >>> print asciidict2
        {'a': 'a1', 'h': [8, 9, 10], 'c': {'e': ['e4', 'e5', 'e6'], 'd': 'd3', 'f': {'g': 'g7'}}, 'b': 'b2'}
        >>> print type(asciidict2)
        <type 'dict'>
        >>> asciidict2 == d
        True

    LIMITATIONS
        For a multi-layered data structure (dict of lists, list of
        strings etc.) depth traversal of the data structure stops when
        the element encountered is not a string, unicode, list or dict.
        For example, in this dict:

            d = {'a': {'b': [1, 2, set([u'x', u'y'])], 'c': u'z'}}

        ... the u'x' and u'y' items are contained within a set, and
        therefore would not be asciified, while u'z' is contained in a
        dict and would be asciified since the traversal of the rest of
        the structure continues.

        Dict keys that become identical once asciified overwrite one
        another in the result.

        A future @@todo could be to throw an error if a non-traversable
        input is used, or have additional parameter that can allow the
        non-traversable input to be used even though the result is a
        partial discard of data.
    """
    # Function-scope import: keeps the function usable even when the
    # surrounding module does not import unicodedata itself.
    import unicodedata

    # Python 2 has a separate ``unicode`` text type; on Python 3 the name
    # is gone and every ``str`` is already text.
    try:
        text_type = unicode  # noqa: F821 -- only defined on Python 2
    except NameError:
        text_type = str

    ## embedded functions

    def _remove_accents(text):
        """Change accented letters to non-accented approximation, like Nestle."""
        # NFKD splits an accented letter into base letter + combining
        # mark; encoding to ASCII with 'ignore' then drops the marks (and
        # anything else that has no ASCII form).
        ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        if isinstance(ascii_bytes, str):
            # Python 2: the encoded bytes already are the native str.
            return ascii_bytes
        # Python 3: turn the ASCII bytes back into a native str.
        return ascii_bytes.decode('ascii')

    def _asciify_value(value):
        """Asciify one list item or dict value, recursing into containers."""
        if isinstance(value, text_type):
            return _remove_accents(value)
        if isinstance(value, list):
            return _asciify_list(value)
        if isinstance(value, dict):
            return _asciify_dict(value)
        # Any other type (numbers, None, sets, ...) is kept untouched;
        # traversal does not descend into it.
        return value

    def _asciify_list(items):
        """Ascii-fies list values, returning a new list."""
        return [_asciify_value(item) for item in items]

    def _asciify_dict(mapping):
        """Ascii-fies dict keys and values, returning a new dict."""
        ret = {}
        for key, value in mapping.items():
            if isinstance(key, text_type):
                key = _remove_accents(key)
            ## the value is converted independently of the key
            ret[key] = _asciify_value(value)
        return ret

    ## main function
    if isinstance(data, list):
        return _asciify_list(data)
    if isinstance(data, dict):
        return _asciify_dict(data)
    if isinstance(data, text_type):
        return _remove_accents(data)
    if isinstance(data, str):
        # Python 2 byte strings pass through unchanged.
        return data
    raise TypeError('Input must be dict, list, str or unicode')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.