public
Last active

Unicode to ASCII / UTF-8 converter for Python dicts, lists, strings and nested combinations of dicts, lists and strings

  • Download Gist
gistfile1.txt
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
def asciify(data):
    r"""
    SYNOPSIS
        Asciifies strings, lists and dicts, and nested versions of same

    DESCRIPTION
        The JSON spec (http://www.ietf.org/rfc/rfc4627.txt) -- "JSON text SHALL
        be encoded in Unicode". For apps that don't use unicode, this function
        walks through all levels of a JSON data structure and converts each
        text item to an ASCII-only native ``str``.
        See http://stackoverflow.com/questions/956867/ for original.

        Accented letters are replaced by a non-accented approximation
        (u'Nestl\xe9' becomes 'Nestle'): the text is NFKD-normalized and every
        code point that is still outside ASCII is then dropped.

        Can be used for any nesting of strings / lists / dicts, e.g. a list of
        dicts, a dict in which values are lists of strings etc.
        See LIMITATIONS.

    PARAMETERS
        data    A string, unicode, list or dict, or nested versions of the
                same types. Typically the dict resulting from json.load()
                or json.loads().

    RETURNS
        A new object of the same kind as the input (str, list or dict) in
        which every text key and value has been converted to ASCII. Items of
        any other type are carried over unchanged. The input is not modified.

    RAISES
        TypeError   if ``data`` itself is not a dict, list, str or unicode.

    USAGE
        (1) create dict from string version of data structure, then asciify:

            s = json.dumps(x)
            d = json.loads(s)
            d = asciify(d)

        (2) asciify as the dict is being created via object hook:

            s = json.dumps(x)
            d = json.loads(s, object_hook=asciify)

        NOTE: calling asciify() on the JSON text itself before json.loads()
        does not help. A ``str`` input is returned as-is, and json.loads()
        still produces unicode strings from it (see the ``unidict`` example
        below), so the decoded structure must be asciified with (1) or (2).

    EXAMPLES
        >>> import json

        >>> asciify('ASCII string') == 'ASCII string'
        True
        >>> type(asciify(u'Unicode string')) is str
        True
        >>> asciify(u'Nestl\xe9') == 'Nestle'
        True

        >>> asciify([u'a', u'b', u'c']) == ['a', 'b', 'c']
        True
        >>> asciify({u'a': 'aa', 'b': u'bb', u'c': u'cc'}) == {
        ...     'a': 'aa', 'b': 'bb', 'c': 'cc'}
        True

        >>> d = dict(a='a1', b='b2', h=[8, 9, 10],
        ...          c=dict(d='d3', e=['e4', 'e5', 'e6'], f=dict(g='g7')))
        >>> unidict = json.loads(json.dumps(d))
        >>> asciify(unidict) == d
        True
        >>> json.loads(json.dumps(d), object_hook=asciify) == d
        True

    LIMITATIONS
        For a multi-layered data structure (dict of lists, list of strings
        etc.) depth traversal of the data structure stops when the element
        encountered is not a string, unicode, list or dict. For example, in
        this dict:

            d = {'a': {'b': [1, 2, set([u'x', u'y'])], 'c': u'z'}}

        ... the u'x' and u'y' items are contained within a set, and therefore
        would not be asciified, while u'z' is contained in a dict and would be
        asciified since the breadth traversal of the structure continues.

        Characters that have no ASCII decomposition are removed rather than
        approximated, and two distinct keys that asciify to the same text
        collapse into a single dict entry.

        On Python 2, byte strings (``str``) are never inspected; they are
        returned unchanged even if they contain non-ASCII bytes.

        A future @@todo could be to throw an error if a non-traversable input
        is used, or have additional parameter that can allow the
        non-traversable input to be used even though the result is a partial
        discard of data.
    """
    # Imported locally so the function is self-contained: the original
    # snippet called unicodedata without importing it.
    import unicodedata

    try:
        text_type = unicode  # noqa: F821 -- Python 2 text type
    except NameError:
        text_type = str  # Python 3: str already is the text type

    # see http://stackoverflow.com/a/517974
    def _remove_accents(text):
        """Return ``text`` as an ASCII-only native str, accents stripped."""
        decomposed = unicodedata.normalize('NFKD', text)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        # On Python 2 the encoded value already is a native str. On Python 3
        # it is bytes and has to be decoded back to str.
        if isinstance(ascii_bytes, str):
            return ascii_bytes
        return ascii_bytes.decode('ascii')

    def _asciify_item(item):
        """Asciify one value; unsupported types are returned untouched."""
        if isinstance(item, text_type):
            return _remove_accents(item)
        if isinstance(item, list):
            return _asciify_list(item)
        if isinstance(item, dict):
            return _asciify_dict(item)
        return item

    def _asciify_list(items):
        """Ascii-fies list values."""
        return [_asciify_item(item) for item in items]

    def _asciify_dict(mapping):
        """Ascii-fies dict keys and values."""
        ret = {}
        for key, value in mapping.items():
            # Only text keys are rewritten; any other key is kept as-is.
            if isinstance(key, text_type):
                key = _remove_accents(key)
            ret[key] = _asciify_item(value)
        return ret

    # main function
    if isinstance(data, (list, dict, text_type)):
        return _asciify_item(data)
    if isinstance(data, str):
        # Only reachable on Python 2, where str is a byte string.
        return data
    raise TypeError('Input must be dict, list, str or unicode')

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.