Skip to content

Instantly share code, notes, and snippets.

@mgd020
Created August 27, 2018 23:08
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save mgd020/c869d4b81e8b80030e18ad90c90fdb66 to your computer and use it in GitHub Desktop.
An example implementation of binary json
import struct
import sys
"""
encoding is little endian
header details: (lsb to msb)
int7 1
length 7 bits of int [-64, 63]
else 0
obj 11
length 5 bits of uint [0, 30]
...
array 10
length 5 bits of uint [0, 30]
...
string 01
length 5 bits of uint [0, 30]
...
else 00
number 1
int8 00
unsigned 0
signed 1
int16 01
unsigned 0
signed 1
int32 10
unsigned 0
signed 1
float 11
32 0
64 1
literal 0
none 0
bool 1
false 0
true 1
todo:
not happy with decoding
not happy with performance in general (although haven't tested)
javascript library
"""
INT7 = 0b00000001 # NOQA
NESTED = 0b00000110 # NOQA
OBJECT = 0b00000110 # NOQA
ARRAY = 0b00000100 # NOQA
STRING = 0b00000010 # NOQA
NUMBER = 0b00001000 # NOQA
UINT8 = 0b00001000 # NOQA
INT8 = 0b01001000 # NOQA
UINT16 = 0b00011000 # NOQA
INT16 = 0b01011000 # NOQA
UINT32 = 0b00101000 # NOQA
INT32 = 0b01101000 # NOQA
FLOAT = 0b00111000 # NOQA
FLOAT32 = 0b00111000 # NOQA
FLOAT64 = 0b01111000 # NOQA
NONE = 0b00000000 # NOQA
FALSE = 0b00010000 # NOQA
TRUE = 0b00110000 # NOQA
INT7_SHIFT = 1
BYTE_MASK = 0xff
INLINE_LEN_SHIFT = 3
INLINE_LEN_MAX = 2 ** 5 - 1 # 5 bits
FLOAT_INT_MIN = -9007199254740991
INT_MIN = -2147483648
INT16_MIN = -32768
INT8_MIN = -128
INT7_MIN = -64
INT7_MAX = 63
UINT8_MAX = 255
UINT16_MAX = 65535
UINT32_MAX = 4294967295
FLOAT_INT_MAX = 9007199254740991
ENCODE_LITERALS = {
None: NONE,
True: TRUE,
False: FALSE,
}
ENCODE_OBJECT_KEYS = {
None: 'null',
False: 'false',
True: 'true',
}
DECODE_FORMAT = {
UINT8: 'B',
INT8: 'b',
UINT16: 'H',
INT16: 'h',
UINT32: 'I',
INT32: 'i',
FLOAT32: 'f',
FLOAT64: 'd',
}
DECODE_SIZE = {
typ: struct.calcsize(fmt)
for typ, fmt in DECODE_FORMAT.items()
}
DECODE_LITERALS = {
NONE: None,
TRUE: True,
FALSE: False,
}
FLOAT_EPSILON = sys.float_info.epsilon
def encode(obj):
if obj is False or obj is True or obj is None:
return encode_literal(obj)
if isinstance(obj, (int, float)):
return encode_number(obj)
if isinstance(obj, str):
return encode_string(obj)
if isinstance(obj, (list, tuple)):
return encode_array(obj)
if isinstance(obj, dict):
return encode_object(obj)
raise ValueError(obj)
def encode_literal(obj):
try:
return struct.pack('<B', ENCODE_LITERALS[obj])
except KeyError:
raise ValueError(obj)
def encode_number(obj):
if isinstance(obj, int) or isinstance(obj, float) and obj.is_integer():
try:
return encode_int(obj)
except OverflowError:
pass
return encode_float(obj)
def encode_int(obj):
obj = int(obj)
if obj is 0:
return struct.pack('<B', INT7)
if obj < INT_MIN:
raise OverflowError(obj)
if obj < INT16_MIN:
return struct.pack('<Bi', INT32, obj)
if obj < INT8_MIN:
return struct.pack('<Bh', INT16, obj)
if obj < INT7_MIN:
return struct.pack('<Bb', INT8, obj)
if obj <= INT7_MAX:
return struct.pack('<B', (INT7 | (obj << INT7_SHIFT)) & BYTE_MASK)
if obj <= UINT8_MAX:
return struct.pack('<BB', UINT8, obj)
if obj <= UINT16_MAX:
return struct.pack('<BH', UINT16, obj)
if obj <= UINT32_MAX:
return struct.pack('<BI', UINT32, obj)
raise OverflowError(obj)
def encode_float(obj):
# try to put it in 32 (see if it truncates)
obj = float(obj)
if obj.is_integer() and (obj < FLOAT_INT_MIN or obj > FLOAT_INT_MAX):
raise OverflowError(obj)
f32 = struct.pack('f', obj)
if abs(struct.unpack('f', f32)[0] - obj) < FLOAT_EPSILON:
return struct.pack('<B', FLOAT32) + f32
return struct.pack('<Bd', FLOAT64, obj)
def encode_string(obj):
return encode_values(STRING, obj.encode())
def encode_array(obj):
return encode_values(ARRAY, b''.join(encode(o) for o in obj))
def encode_object(obj):
objs = []
for key, value in obj.items():
if key is None or key is False or key is True:
key = ENCODE_OBJECT_KEYS[key]
elif isinstance(key, (int, float)):
key = str(key)
elif not isinstance(key, str):
raise ValueError(key)
key, value = encode_string(key), encode(value)
objs.append(key)
objs.append(value)
return encode_values(OBJECT, b''.join(objs))
def encode_values(type_, obj):
obj_len = len(obj)
output = []
if obj_len < INLINE_LEN_MAX:
output.append(struct.pack('<B', type_ | (obj_len << INLINE_LEN_SHIFT)))
else:
output.append(struct.pack('<B', type_ | (INLINE_LEN_MAX << INLINE_LEN_SHIFT)))
output.append(encode_int(obj_len))
output.append(obj)
return b''.join(output)
def decode(bytez):
bytez = memoryview(bytez)
try:
return decode_value(bytez)[0]
finally:
bytez.release()
def decode_value(bytez):
head = bytez[0]
if head & INT7:
return decode_number(bytez)
nested = head & NESTED
if nested == OBJECT:
return decode_object(bytez)
if nested == ARRAY:
return decode_array(bytez)
if nested == STRING:
return decode_string(bytez)
if head & NUMBER:
return decode_number(bytez)
return decode_literal(bytez)
def decode_number(bytez):
head = bytez[0]
if head & INT7:
return struct.unpack('<b', bytez[:1].tobytes())[0] >> INT7_SHIFT, bytez[1:]
if not (head & NUMBER):
raise ValueError(head)
end = DECODE_SIZE[head] + 1
return struct.unpack('<' + DECODE_FORMAT[head], bytez[1:end].tobytes())[0], bytez[end:]
def decode_object(bytez):
val_len, bytez = decode_values(OBJECT, bytez)
my_bytez, bytez = bytez[:val_len], bytez[val_len:]
items = []
while my_bytez:
key, my_bytez = decode_string(my_bytez)
value, my_bytez = decode_value(my_bytez)
items.append((key, value))
return dict(items), bytez
def decode_array(bytez):
val_len, bytez = decode_values(ARRAY, bytez)
my_bytez, bytez = bytez[:val_len], bytez[val_len:]
items = []
while my_bytez:
item, my_bytez = decode_value(my_bytez)
items.append(item)
return items, bytez
def decode_string(bytez):
val_len, bytez = decode_values(STRING, bytez)
my_bytez, bytez = bytez[:val_len], bytez[val_len:]
string = my_bytez.tobytes().decode()
return string, bytez
def decode_values(type_, bytez):
head = bytez[0]
if not (head & type_):
raise ValueError(head)
val_len = head >> INLINE_LEN_SHIFT
if val_len == INLINE_LEN_MAX:
return decode_number(bytez[1:])
return val_len, bytez[1:]
def decode_literal(bytez):
try:
return DECODE_LITERALS[bytez[0]], bytez[1:]
except KeyError:
raise ValueError(bytez[0])
EXAMPLE = """
[
{
"_id": "5b839f9a57ffc84c14d467ec",
"index": 0,
"guid": "a14e6e3c-4af7-4e95-a10e-440e5854c096",
"isActive": false,
"balance": "$2,636.83",
"picture": "http://placehold.it/32x32",
"age": 27,
"eyeColor": "green",
"name": {
"first": "Palmer",
"last": "Barrera"
},
"company": "INSURON",
"email": "palmer.barrera@insuron.tv",
"phone": "+1 (996) 462-2803",
"address": "117 Congress Street, Crumpler, Michigan, 7879",
"about": "Exercitation anim laborum commodo culpa consequat nisi adipisicing laboris. Magna duis est non et excepteur nisi est reprehenderit quis amet sit magna amet. Ex consequat commodo amet dolor laborum proident. Velit do nostrud ex consequat officia non labore eiusmod sunt elit adipisicing. Officia consequat amet velit irure. Et dolore incididunt do commodo laborum pariatur dolore.",
"registered": "Saturday, June 16, 2018 7:27 PM",
"latitude": "-31.69379",
"longitude": "53.709662",
"tags": [
"dolore",
"enim",
"fugiat",
"officia",
"ipsum"
],
"range": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9
],
"friends": [
{
"id": 0,
"name": "Bonita Fletcher"
},
{
"id": 1,
"name": "Selena Head"
},
{
"id": 2,
"name": "Walters Holmes"
}
],
"greeting": "Hello, Palmer! You have 10 unread messages.",
"favoriteFruit": "apple"
},
{
"_id": "5b839f9aaed8a286ea47bff6",
"index": 1,
"guid": "37320035-0be9-46c2-8a13-8fd1ed77625a",
"isActive": true,
"balance": "$1,799.34",
"picture": "http://placehold.it/32x32",
"age": 35,
"eyeColor": "green",
"name": {
"first": "Montgomery",
"last": "Mcintosh"
},
"company": "KONGLE",
"email": "montgomery.mcintosh@kongle.net",
"phone": "+1 (804) 592-3830",
"address": "705 Broadway , Boling, Pennsylvania, 5130",
"about": "Ullamco est non laborum exercitation aliquip. Reprehenderit commodo magna labore sunt laboris fugiat fugiat est laboris laboris culpa. Id officia aliquip sit esse mollit excepteur Lorem excepteur sunt et. Nulla ut aliqua nisi irure ad nostrud deserunt nisi minim. Laborum laboris laboris laborum enim sint tempor.",
"registered": "Tuesday, January 26, 2016 2:21 PM",
"latitude": "10.810524",
"longitude": "43.073251",
"tags": [
"velit",
"eiusmod",
"Lorem",
"excepteur",
"proident"
],
"range": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9
],
"friends": [
{
"id": 0,
"name": "Goff Nunez"
},
{
"id": 1,
"name": "Maribel Cook"
},
{
"id": 2,
"name": "Walter Camacho"
}
],
"greeting": "Hello, Montgomery! You have 7 unread messages.",
"favoriteFruit": "apple"
},
{
"_id": "5b839f9a440b50286505b90b",
"index": 2,
"guid": "7517462e-2e78-4ac4-9d0b-db2ae167e1c2",
"isActive": true,
"balance": "$3,714.79",
"picture": "http://placehold.it/32x32",
"age": 20,
"eyeColor": "blue",
"name": {
"first": "Rocha",
"last": "Petty"
},
"company": "ACRUEX",
"email": "rocha.petty@acruex.info",
"phone": "+1 (822) 547-3307",
"address": "373 Bedford Place, Soham, Missouri, 4000",
"about": "Ea veniam reprehenderit sunt id velit fugiat culpa labore reprehenderit amet sint reprehenderit in. Adipisicing tempor sunt occaecat est irure pariatur elit dolor. Exercitation enim ea nostrud enim Lorem nostrud sunt. Velit laborum in do Lorem cupidatat amet sint aliquip reprehenderit nisi pariatur.",
"registered": "Thursday, November 23, 2017 3:08 AM",
"latitude": "-11.02797",
"longitude": "-108.727382",
"tags": [
"proident",
"exercitation",
"velit",
"anim",
"ea"
],
"range": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9
],
"friends": [
{
"id": 0,
"name": "Jerri Harvey"
},
{
"id": 1,
"name": "Juliana Guerrero"
},
{
"id": 2,
"name": "Phoebe Bradley"
}
],
"greeting": "Hello, Rocha! You have 9 unread messages.",
"favoriteFruit": "apple"
},
{
"_id": "5b839f9a03a1f92e4ee91afb",
"index": 3,
"guid": "05fc1f34-813a-4830-9e59-10f5179279ac",
"isActive": true,
"balance": "$1,546.57",
"picture": "http://placehold.it/32x32",
"age": 20,
"eyeColor": "blue",
"name": {
"first": "Dawson",
"last": "Cote"
},
"company": "GROK",
"email": "dawson.cote@grok.biz",
"phone": "+1 (994) 594-3002",
"address": "702 Irving Street, Turpin, Georgia, 3260",
"about": "Minim exercitation fugiat commodo et esse sunt laboris non. Reprehenderit laborum deserunt culpa voluptate incididunt et irure cillum enim. Minim cupidatat non pariatur deserunt nulla amet amet aliqua anim ut.",
"registered": "Sunday, April 16, 2017 8:46 AM",
"latitude": "-9.625026",
"longitude": "166.046194",
"tags": [
"exercitation",
"est",
"occaecat",
"proident",
"dolor"
],
"range": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9
],
"friends": [
{
"id": 0,
"name": "Lacey Gardner"
},
{
"id": 1,
"name": "Courtney Nixon"
},
{
"id": 2,
"name": "Megan Franks"
}
],
"greeting": "Hello, Dawson! You have 6 unread messages.",
"favoriteFruit": "apple"
},
{
"_id": "5b839f9a4d28e7bf56fdebcf",
"index": 4,
"guid": "210c3e75-dbd7-44aa-bcb6-2b57568e5e15",
"isActive": true,
"balance": "$2,880.71",
"picture": "http://placehold.it/32x32",
"age": 33,
"eyeColor": "green",
"name": {
"first": "Doris",
"last": "Norris"
},
"company": "PARCOE",
"email": "doris.norris@parcoe.name",
"phone": "+1 (958) 435-2989",
"address": "185 Autumn Avenue, Gilmore, Nebraska, 2062",
"about": "Quis dolor ipsum aliquip consectetur pariatur. Nisi anim tempor laboris commodo irure nulla dolore adipisicing quis labore commodo ea. Minim ullamco aute commodo consequat duis irure. Commodo do eiusmod ipsum nulla est duis fugiat. Ipsum minim proident eu id deserunt laborum anim.",
"registered": "Sunday, March 18, 2018 5:25 AM",
"latitude": "80.887345",
"longitude": "71.54529",
"tags": [
"commodo",
"ut",
"magna",
"dolore",
"nisi"
],
"range": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9
],
"friends": [
{
"id": 0,
"name": "Acevedo Sloan"
},
{
"id": 1,
"name": "Winnie Mccormick"
},
{
"id": 2,
"name": "Mccarty Hubbard"
}
],
"greeting": "Hello, Doris! You have 9 unread messages.",
"favoriteFruit": "apple"
},
{
"_id": "5b839f9ab52000fbbead1ce7",
"index": 5,
"guid": "0bb4b78d-74f8-405c-8ed8-c5d8e789843c",
"isActive": true,
"balance": "$1,535.46",
"picture": "http://placehold.it/32x32",
"age": 30,
"eyeColor": "brown",
"name": {
"first": "Marianne",
"last": "Brooks"
},
"company": "VIASIA",
"email": "marianne.brooks@viasia.us",
"phone": "+1 (846) 555-3165",
"address": "171 Gates Avenue, Calvary, Iowa, 3995",
"about": "Tempor culpa dolore aliqua mollit proident ipsum quis Lorem. Veniam qui laborum do nulla tempor est. Tempor amet aute magna sunt nulla esse ut consectetur excepteur. Lorem mollit ex duis exercitation. Mollit consectetur mollit incididunt quis. Esse dolore eiusmod nostrud adipisicing eiusmod laborum occaecat.",
"registered": "Wednesday, October 19, 2016 4:22 AM",
"latitude": "16.225098",
"longitude": "-10.813529",
"tags": [
"officia",
"ex",
"eu",
"excepteur",
"laboris"
],
"range": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9
],
"friends": [
{
"id": 0,
"name": "Iva Vinson"
},
{
"id": 1,
"name": "Trevino Daniels"
},
{
"id": 2,
"name": "Charles Benson"
}
],
"greeting": "Hello, Marianne! You have 7 unread messages.",
"favoriteFruit": "banana"
}
]
"""
if __name__ == '__main__':
import json
import timeit
data = json.loads(EXAMPLE)
json_data = json.dumps(data)
print('json', 'dump:', '%.2fms' % sum(timeit.repeat(lambda: json.dumps(data), number=1000)), 'load:', '%.2fms' % sum(timeit.repeat(lambda: json.loads(json_data), number=1000)))
bin_data = encode(data)
print('bin', 'dump:', '%.2fms' % sum(timeit.repeat(lambda: encode(data), number=1000)), 'load:', '%.2fms' % sum(timeit.repeat(lambda: decode(bin_data), number=1000)))
print('json', len(json_data), '-> bin', len(bin_data), '=', '%.1f%%' % (len(bin_data) / len(json_data) * 100))
tests = [
(None, b'\x00', None),
(False, b'\x10', False),
(True, b'\x30', True),
(0, b'\x01', 0),
(1, b'\x03', 1),
(-1, b'\xff', -1),
(63, b'\x7f', 63),
(64, b'\x08\x40', 64),
(-64, b'\x81', -64),
(-65, b'\x48\xbf', -65),
(255, b'\x08\xff', 255),
(256, b'\x18\x00\x01', 256),
(-128, b'\x48\x80', -128),
(-129, b'\x58\x7f\xff', -129),
(65535, b'\x18\xff\xff', 65535),
(65536, b'\x28\x00\x00\x01\x00', 65536),
(-32768, b'\x58\x00\x80', -32768),
(-32769, b'\x68\xff\x7f\xff\xff', -32769),
(4294967295, b'\x28\xff\xff\xff\xff', 4294967295),
(4294967296, b'\x38\x00\x00\x80\x4F', 4294967296),
(-2147483648, b'\x68\x00\x00\x00\x80', -2147483648),
(-2147483649, b'\x78\x00\x00\x20\x00\x00\x00\xe0\xc1', -2147483649),
(9007199254740991, b'\x78\xff\xff\xff\xff\xff\xff\x3f\x43', 9007199254740991),
(9007199254740992, OverflowError, None),
(-9007199254740991, b'\x78\xff\xff\xff\xff\xff\xff\x3f\xc3', -9007199254740991),
(-9007199254740992, OverflowError, None),
(1.0, b'\x03', 1.0),
(1.5, b'\x38\x00\x00\xc0\x3f', 1.5),
(1.8, b'\x78\xcd\xcc\xcc\xcc\xcc\xcc\xfc\x3f', 1.8),
('', b'\x02', ''),
('a', b'\x0a\x61', 'a'),
('a' * 30, b'\xf2' + (b'a' * 30), 'a' * 30),
('a' * 31, b'\xfa\x3f' + (b'a' * 31), 'a' * 31),
([], b'\x04', []),
((), b'\x04', []),
([None], b'\x0c\x00', [None]),
([None] * 30, b'\xf4' + (b'\x00' * 30), [None] * 30),
([None] * 31, b'\xfc\x3f' + (b'\x00' * 31), [None] * 31),
({}, b'\x06', {}),
({1: None}, b'\x1e\x0a\x31\x00', {'1': None}),
({None: 1, False: 2, True: 3}, b'\x9e\x22\x6e\x75\x6c\x6c\x03\x2a\x66\x61\x6c\x73\x65\x05\x22\x74\x72\x75\x65\x07', {'null': 1, 'false': 2, 'true': 3}),
({i: None for i in range(30)}, b'\xfe\x08\x6e\x0a\x30\x00\x0a\x31\x00\x0a\x32\x00\x0a\x33\x00\x0a\x34\x00\x0a\x35\x00\x0a\x36\x00\x0a\x37\x00\x0a\x38\x00\x0a\x39\x00\x12\x31\x30\x00\x12\x31\x31\x00\x12\x31\x32\x00\x12\x31\x33\x00\x12\x31\x34\x00\x12\x31\x35\x00\x12\x31\x36\x00\x12\x31\x37\x00\x12\x31\x38\x00\x12\x31\x39\x00\x12\x32\x30\x00\x12\x32\x31\x00\x12\x32\x32\x00\x12\x32\x33\x00\x12\x32\x34\x00\x12\x32\x35\x00\x12\x32\x36\x00\x12\x32\x37\x00\x12\x32\x38\x00\x12\x32\x39\x00', {str(i): None for i in range(30)}),
({i: None for i in range(31)}, b'\xfe\x08\x72\x0a\x30\x00\x0a\x31\x00\x0a\x32\x00\x0a\x33\x00\x0a\x34\x00\x0a\x35\x00\x0a\x36\x00\x0a\x37\x00\x0a\x38\x00\x0a\x39\x00\x12\x31\x30\x00\x12\x31\x31\x00\x12\x31\x32\x00\x12\x31\x33\x00\x12\x31\x34\x00\x12\x31\x35\x00\x12\x31\x36\x00\x12\x31\x37\x00\x12\x31\x38\x00\x12\x31\x39\x00\x12\x32\x30\x00\x12\x32\x31\x00\x12\x32\x32\x00\x12\x32\x33\x00\x12\x32\x34\x00\x12\x32\x35\x00\x12\x32\x36\x00\x12\x32\x37\x00\x12\x32\x38\x00\x12\x32\x39\x00\x12\x33\x30\x00', {str(i): None for i in range(31)}),
({i: None for i in range(32)}, b'\xfe\x08\x76\x0a\x30\x00\x0a\x31\x00\x0a\x32\x00\x0a\x33\x00\x0a\x34\x00\x0a\x35\x00\x0a\x36\x00\x0a\x37\x00\x0a\x38\x00\x0a\x39\x00\x12\x31\x30\x00\x12\x31\x31\x00\x12\x31\x32\x00\x12\x31\x33\x00\x12\x31\x34\x00\x12\x31\x35\x00\x12\x31\x36\x00\x12\x31\x37\x00\x12\x31\x38\x00\x12\x31\x39\x00\x12\x32\x30\x00\x12\x32\x31\x00\x12\x32\x32\x00\x12\x32\x33\x00\x12\x32\x34\x00\x12\x32\x35\x00\x12\x32\x36\x00\x12\x32\x37\x00\x12\x32\x38\x00\x12\x32\x39\x00\x12\x33\x30\x00\x12\x33\x31\x00', {str(i): None for i in range(32)}),
]
# import traceback
# TODO: decode partials
for obj, data, obj_out in tests:
try:
try:
expected = data
encoded = encode(obj)
recieved = encoded
except Exception as e:
recieved = type(e)
# traceback.print_exc()
assert recieved == expected, 'error encoding'
if isinstance(recieved, type) and issubclass(recieved, Exception):
continue
try:
expected = obj_out
decoded = decode(recieved)
recieved = decoded
except Exception as e:
recieved = type(e)
# traceback.print_exc()
assert recieved == expected, 'error decoding'
except Exception as e:
print('Input: %r\nExpected: %r\nGot: %r' % (obj, expected, recieved))
raise
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment