Skip to content

Instantly share code, notes, and snippets.

@chris-hailstorm
Last active December 31, 2021 19:21
Show Gist options
  • Save chris-hailstorm/4989643 to your computer and use it in GitHub Desktop.
Save chris-hailstorm/4989643 to your computer and use it in GitHub Desktop.
Unicode to ASCII / UTF-8 converter for Python dicts, lists, strings and nested combinations of dicts, lists and strings
def asciify(data):
"""
SYNOPSIS
Asciifies strings, lists and dicts, and nested versions of same
DESCRIPTION
The JSON spec (http://www.ietf.org/rfc/rfc4627.txt) -- "JSON text SHALL
be encoded in Unicode". For apps that don't use unicode, this function
walks through all levels of a JSON data structure and converts each item
to ASCII. See http://stackoverflow.com/questions/956867/ for original.
Can be used for any nesting of strings / lists / dicts, e.g. a list of
dicts, a dict in which values are lists of strings etc. See LIMITATIONS.
PARAMETERS
data A string, unicode, list or dict, or nested versions of the
same types. Typically the string output from json.dumps()
or the dict resulting from json.load() or json.loads().
RETURNS
A Python dictionary with all keys and values converted to UTF-8.
USAGE
There are several equivalent ways to use this function.
(1) asciify string version of data structure before creating dict:
s = json.dumps(x)
d = json.loads(asciify(s))
(2) create dict from string version of data structure, then asciify:
s = json.dumps(x)
d = json.loads(s)
d = asciify(d)
(3) asciify as the dict is being created via object hook:
s = json.dumps(x)
d = json.loads(s, object_hook=asciify)
Asciifying the string first (approach (1) above) is probably the best
approach since the input is a flat string and there's no possibility of
the depth traversal stopping due to an unknown type. See LIMITATIONS.
EXAMPLES
>>> import json
>>> s1 = 'ASCII string'
>>> type(s1)
<type 'str'>
>>> s1 = asciify(s1)
>>> type(s1)
<type 'str'>
>>> s2 = u'Unicode string'
>>> type(s2)
<type 'unicode'>
>>> s2 = asciify(s2)
>>> type(s2)
<type 'str'>
>>> s3 = 'Nestl'+unichr(0xe9)
>>> print asciify(s3)
Nestle
>>> asciify(['a','b','c'])
['a', 'b', 'c']
>>> asciify([u'a',u'b',u'c'])
['a', 'b', 'c']
>>> asciify({'a':'aa','b':'bb','c':'cc'})
{'a': 'aa', 'c': 'cc', 'b': 'bb'}
>>> asciify({u'a':'aa','b':u'bb',u'c':u'cc'})
{'a': 'aa', 'c': 'cc', 'b': 'bb'}
>>> d = dict(a='a1',b='b2',c=dict(d='d3',e=['e4','e5','e6'],f=dict(g='g7')),h=[8,9,10])
>>> print d
{'a': 'a1', 'h': [8, 9, 10], 'c': {'e': ['e4', 'e5', 'e6'], 'd': 'd3', 'f': {'g': 'g7'}}, 'b': 'b2'}
>>> print type(d)
<type 'dict'>
>>> asciistr = json.dumps(d)
>>> print asciistr
{"a": "a1", "h": [8, 9, 10], "c": {"e": ["e4", "e5", "e6"], "d": "d3", "f": {"g": "g7"}}, "b": "b2"}
>>> print type(asciistr)
<type 'str'>
>>> unidict = json.loads(asciistr)
>>> print unidict
{u'a': u'a1', u'h': [8, 9, 10], u'c': {u'e': [u'e4', u'e5', u'e6'], u'd': u'd3', u'f': {u'g': u'g7'}}, u'b': u'b2'}
>>> print type(unidict)
<type 'dict'>
>>> unidict == d
True
>>> asciidict1 = asciify(unidict)
>>> print asciidict1
{'a': 'a1', 'h': [8, 9, 10], 'c': {'e': ['e4', 'e5', 'e6'], 'd': 'd3', 'f': {'g': 'g7'}}, 'b': 'b2'}
>>> print type(asciidict1)
<type 'dict'>
>>> asciidict1 == d
True
>>> asciidict2 = json.loads(asciistr, object_hook=asciify)
>>> print asciidict2
{'a': 'a1', 'h': [8, 9, 10], 'c': {'e': ['e4', 'e5', 'e6'], 'd': 'd3', 'f': {'g': 'g7'}}, 'b': 'b2'}
>>> print type(asciidict2)
<type 'dict'>
>>> asciidict2 == d
True
LIMITATIONS
For a multi-layered data structure (dict of lists, list of strings etc.)
depth traversal of the data structure stops when the element encountered
is not a string, unicode, list or dict. For example, in this dict:
> d = {'a': { 'b': [1, 2, set(u'x', u'y'] ), 'c': u'z' } }
... the u'x' and u'y' items are contained within a set, and therefore
would not be asciified, while u'z' is contained in a dict and would be
asciified since the breadth traversal of the structure continues.
A future @@todo could be to throw an error if a non-traversable input
is used, or have additional parameter that can allow the non-traversable
input to be used even though the result is a partial discard of data.
"""
##
## embedded functions
##
## see http://stackoverflow.com/a/517974
def _remove_accents(data):
"""
Changes accented letters to non-accented approximation, like Nestle
"""
return unicodedata.normalize('NFKD', data).encode('ascii', 'ignore')
##
def _asciify_list(data):
""" Ascii-fies list values """
ret = []
for item in data:
if isinstance(item, unicode):
item = _remove_accents(item)
item = item.encode('utf-8')
elif isinstance(item, list):
item = _asciify_list(item)
elif isinstance(item, dict):
item = _asciify_dict(item)
ret.append(item)
return ret
#
def _asciify_dict(data):
""" Ascii-fies dict keys and values """
ret = {}
for key, value in data.iteritems():
if isinstance(key, unicode):
key = _remove_accents(key)
key = key.encode('utf-8')
## note new if
if isinstance(value, unicode):
value = _remove_accents(value)
value = value.encode('utf-8')
elif isinstance(value, list):
value = _asciify_list(value)
elif isinstance(value, dict):
value = _asciify_dict(value)
ret[key] = value
return ret
##
## main function
if isinstance(data, list):
return _asciify_list(data)
elif isinstance(data, dict):
return _asciify_dict(data)
elif isinstance(data, unicode):
data = _remove_accents(data)
return data.encode('utf-8')
elif isinstance(data, str):
return data
else:
raise TypeError('Input must be dict, list, str or unicode')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment