Last active
November 1, 2019 16:04
-
-
Save harjitmoe/e695a3fd2622fdf3f67e72a871efab62 to your computer and use it in GitHub Desktop.
Use the pickle/json/marshal basic API with NBT data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- mode: python; coding: utf-8 -*- | |
""" Use the pickle/json/marshal basic API with NBT data. """ | |
# Authored by HarJIT in 2019. This Source Code Form is subject to the terms of the Mozilla Public | |
# License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at | |
# https://mozilla.org/MPL/2.0/. | |
from ctypes import c_byte, c_int16, c_int32, c_int64, c_float, c_double | |
from collections import OrderedDict | |
import struct, io | |
# NBT tag type IDs, numbered consecutively from zero:
#    0 END       1 BYTE      2 SHORT     3 INT       4 LONG
#    5 FLOAT     6 DOUBLE    7 BYTES     8 STRING    9 LIST
#   10 COMPOUND 11 INTS     12 LONGS
# BYTES, INTS and LONGS are the length-prefixed array tags.
(
    END, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE,
    BYTES, STRING, LIST, COMPOUND, INTS, LONGS,
) = range(13)
# Note that nbttypewhenempty is only consulted as a last resort, i.e. for an empty array.
# The idea is that empty arrays round-trip when they sit in a part of the structure which
# the script loading/editing/dumping it doesn't touch.
# The original type itself might be a placeholder (END or BYTE) in the case of an empty list, so
# the content takes priority whenever there is any.
# Note also that nbttypewhenempty does not mean the same thing in both classes: for NBTList it's
# the INNER (element) type and for NBTTuple it's the OUTER (array tag) type, because an array
# tag already implies its element type.
class NBTList(list):
    """A list that remembers which element tag type to declare if dumped while empty.

    The first constructor argument is stored as ``nbttypewhenempty``; any
    further positional arguments are handed to ``list`` unchanged.
    """

    def __init__(self, typ, *args):
        super().__init__(*args)
        # INNER (element) tag ID, only meaningful while the list has no items.
        self.nbttypewhenempty = typ
class NBTTuple(tuple):
    """A tuple that remembers which array tag type to use if dumped while empty.

    The first constructor argument is stored as ``nbttypewhenempty``; any
    further positional arguments are handed to ``tuple`` unchanged.
    """

    # tuple builds its (immutable) contents in __new__, so the extra leading
    # argument has to be peeled off here; overriding __init__ would be too late.
    def __new__(cls, typ, *args):
        instance = tuple.__new__(cls, *args)
        # OUTER (array) tag ID, only meaningful while the tuple has no items.
        instance.nbttypewhenempty = typ
        return instance
# The idea's to be useful for serialising native Python stuff as well as the Minecraft stuff. | |
def _deduce_type(obj):
    """Return the NBT tag ID under which `obj` should be written.

    ctypes fixed-width wrappers map onto their exact tag.  A native ``int``
    gets the narrowest integer tag that can hold it, a native ``float`` is
    treated as DOUBLE, bytes-like objects as BYTES, ``str`` as STRING, ``list``
    as LIST and ``dict`` as COMPOUND.  Tuples become array tags where possible
    (see the comments below) and LIST otherwise.

    Raises ValueError for an integer needing more than 64 bits or for a tuple
    with inconsistent members, and TypeError for an unsupported type or an
    empty tuple that is not an NBTTuple.
    """
    fixed_width_tags = (
        (c_byte, BYTE),
        (c_int16, SHORT),
        (c_int32, INT),
        (c_int64, LONG),
    )
    for wrapper, tag in fixed_width_tags:
        if isinstance(obj, wrapper):
            return tag

    if isinstance(obj, int):
        # If passed as a Python type, assume the width is insignificant and use the
        # shortest representation.  ~obj folds negatives onto the non-negative
        # value needing the same number of two's-complement bits.
        magnitude = ~obj if obj < 0 else obj
        for bits, tag in ((7, BYTE), (15, SHORT), (31, INT), (63, LONG)):
            if not (magnitude >> bits):
                return tag
        raise ValueError("integer too large to be stored in NBT ({!r})".format(obj))

    if isinstance(obj, c_float):
        return FLOAT
    if isinstance(obj, (c_double, float)):
        # Python's native "float" is double-precision.  Float widths are a matter
        # of precision as much as range, so no narrowing is attempted here.
        return DOUBLE
    if isinstance(obj, (bytes, bytearray)):
        return BYTES
    if isinstance(obj, str):
        return STRING
    if isinstance(obj, list):
        return LIST
    if isinstance(obj, dict):
        return COMPOUND
    if not isinstance(obj, tuple):
        raise TypeError("unsupported type {!r}".format(type(obj)))

    # Everything from here on deals with tuples.
    if not obj:
        if isinstance(obj, NBTTuple):
            return obj.nbttypewhenempty
        raise TypeError("cannot automatically represent untyped empty tuple")

    # A tuple solely of native Python integers becomes an int or long array.
    # BYTES is deliberately not used, since that would not round-trip to a tuple.
    if all(isinstance(member, int) for member in obj):
        fits_32_bits = True
        for member in obj:
            magnitude = ~member if member < 0 else member
            if magnitude >> 31:
                fits_32_bits = False
        return INTS if fits_32_bits else LONGS

    # Anything else: all members must share one deduced type, and native
    # integers may not be mixed in.
    inner = _deduce_type(obj[0])
    for member in obj:
        if isinstance(member, int):
            raise ValueError("mixed native and fixed-width integers in a tuple")
        if _deduce_type(member) != inner:
            raise ValueError("mixed-type tuples forbidden in NBT")
    if inner == BYTE:  # Native integers were excluded, so this is necessarily a c_byte.
        return BYTES
    if inner == INT:
        return INTS
    if inner == LONG:
        return LONGS
    return LIST
def _deduce_list_type(lst):
    """Return the NBT tag ID to declare for the elements of the sequence `lst`.

    An empty NBTList reports its stored ``nbttypewhenempty``; any other empty
    sequence reports BYTE.  A sequence of native ``int`` values gets the
    narrowest integer tag able to hold every element.  Otherwise every element
    must deduce (via ``_deduce_type``) to the same tag, which is returned.

    Native integers and ctypes fixed-width integers may never be mixed, not
    even when the native value happens to fit the fixed width; this matches how
    ``_deduce_type`` treats tuples.

    Raises ValueError for mixed element types or an integer needing more than
    64 bits; errors raised by ``_deduce_type`` for an element propagate.
    """
    if not lst:
        if isinstance(lst, NBTList):
            return lst.nbttypewhenempty
        # Empty list without a set type.  END would be the alternative, but that
        # apparently breaks some older parsers.
        return BYTE

    fixed_width = (c_byte, c_int16, c_int32, c_int64)
    native = object()  # sentinel: "all elements so far are native ints"
    status = None
    for item in lst:
        if isinstance(item, int):
            if status is None or status is native:
                status = native
            elif status in (BYTE, SHORT, INT, LONG):
                # Checked before any tag comparison, so that a native int which
                # happens to deduce to the same width is still rejected.
                raise ValueError("mixed native and fixed-width integers in a list")
            else:
                raise ValueError("mixed-type lists forbidden in NBT")
        else:
            tag = _deduce_type(item)
            if status is None:
                status = tag
            elif status is native:
                if isinstance(item, fixed_width):
                    raise ValueError("mixed native and fixed-width integers in a list")
                raise ValueError("mixed-type lists forbidden in NBT")
            elif status != tag:
                raise ValueError("mixed-type lists forbidden in NBT")

    if status is not native:
        return status

    # All native ints: pick the narrowest tag that fits the widest element.
    # ~i folds negatives onto the non-negative value needing the same bit count.
    widest = max((i if i >= 0 else ~i) for i in lst)
    for bits, tag in ((7, BYTE), (15, SHORT), (31, INT), (63, LONG)):
        if not (widest >> bits):
            return tag
    offender = next(i for i in lst if (i if i >= 0 else ~i) >> 63)
    raise ValueError("integer too large to be stored in NBT ({!r})".format(offender))
def _dump(item, file, typ, endian):
    """Write the payload of `item` to `file`, encoded as NBT tag type `typ`.

    Only the payload is written; the tag ID and name header are the caller's
    job (COMPOUND entries get theirs written here, as part of the compound's
    payload).

    item   -- the value; either a native Python object or a ctypes wrapper.
    file   -- object with a write(bytes) method.
    typ    -- NBT tag ID, normally obtained from _deduce_type().
    endian -- struct byte-order prefix: ">" for MCJE data, "<" for MCBE data.

    Raises ValueError if a compound key is not a str, RuntimeError if `typ` is
    not a known tag ID, and struct.error if a value does not fit its field.
    """
    # GENERAL NOTE: not using bytes() on the ctypes types besides c_byte means there is no need
    # to care about the host's native formats.
    if typ == BYTE:
        if isinstance(item, c_byte):
            file.write(bytes(item))
        else:
            # bytes() requires an unsigned representation, hence the &0xFF.
            file.write(bytes([item & 0xFF]))
    elif typ == SHORT:
        value = item.value if isinstance(item, c_int16) else item
        file.write(struct.pack(endian + "h", value))
    elif typ == INT:
        value = item.value if isinstance(item, c_int32) else item
        file.write(struct.pack(endian + "l", value))
    elif typ == LONG:
        value = item.value if isinstance(item, c_int64) else item
        file.write(struct.pack(endian + "q", value))
    elif typ == FLOAT:
        # _deduce_type() only yields FLOAT for c_float, so .value is always there.
        file.write(struct.pack(endian + "f", item.value))
    elif typ == DOUBLE:
        value = item.value if isinstance(item, c_double) else item
        file.write(struct.pack(endian + "d", value))
    elif typ == BYTES:
        file.write(struct.pack(endian + "l", len(item)))
        # Might be passed to us as bytes, bytearray or tuple.
        if item and isinstance(item[0], c_byte):
            # Accepted for consistency with such representations for INTS and LONGS but
            # not recommended; loads() will return a bytes object rather than this format.
            file.write(b"".join(bytes(i) for i in item))
        else:
            file.write(bytes(item))
    elif typ == STRING:
        encoded = item.encode("utf-8")
        # The length prefix is an UNSIGNED 16-bit byte count, the same as the
        # name prefix written for compound entries below.  A signed "h" would
        # reject strings of 32768..65535 encoded bytes.
        file.write(struct.pack(endian + "H", len(encoded)))
        file.write(encoded)
    elif typ == LIST:
        inner = _deduce_list_type(item)
        file.write(struct.pack(endian + "B", inner))
        file.write(struct.pack(endian + "l", len(item)))
        for element in item:
            _dump(element, file, inner, endian)
    elif typ == COMPOUND:
        for (name, value) in item.items():
            value_typ = _deduce_type(value)
            if not isinstance(name, str):
                raise ValueError("keys must be strings")
            bname = name.encode("utf-8")
            file.write(struct.pack(endian + "B", value_typ))
            file.write(struct.pack(endian + "H", len(bname)))
            file.write(bname)
            _dump(value, file, value_typ, endian)
        # A compound is terminated by a bare END tag ID.
        file.write(struct.pack(endian + "B", END))
    elif typ == INTS:
        file.write(struct.pack(endian + "l", len(item)))
        if item:
            # Mixed ctypes and native int would already have raised in _deduce_type(),
            # so inspecting the first element is enough.
            values = [i.value for i in item] if isinstance(item[0], c_int32) else item
            file.write(struct.pack(endian + ("l" * len(item)), *values))
    elif typ == LONGS:
        file.write(struct.pack(endian + "l", len(item)))
        if item:
            # Same reasoning as for INTS.
            values = [i.value for i in item] if isinstance(item[0], c_int64) else item
            file.write(struct.pack(endian + ("q" * len(item)), *values))
    else:
        raise RuntimeError("unexpected `typ` value; this should not happen ({!r})".format(typ))
def dump(obj, file, *, name=None, endian=">"):
    """ Write an object (obj) as uncompressed NBT to a file (file).

    The optional name argument gives a name to the root tag; it must be a
    str, otherwise ValueError is raised.

    The endian argument takes the same format as the struct module. It
    should be set to ">" for MCJE data (the default) or to "<" for
    MCBE (MCPE) data.
    """
    typ = _deduce_type(obj)
    if name is None:
        bname = b""
    elif not isinstance(name, str):
        raise ValueError("keys must be strings")
    else:
        bname = name.encode("utf-8")
    # Build the whole root header before touching the file, so that a bad name
    # (wrong type, or too long for the 16-bit length field) cannot leave a
    # dangling tag byte behind.  Packed separately so that no alignment padding
    # can appear between the two fields.
    header = struct.pack(endian + "B", typ) + struct.pack(endian + "H", len(bname))
    file.write(header)
    if bname:
        file.write(bname)
    _dump(obj, file, typ, endian)
def dumps(obj, *, name=None, endian=">"):
    """ Serialise an object (obj) and return the uncompressed NBT as bytes.

    name, if given, becomes the name of the root tag.

    endian is a struct-module byte-order prefix: ">" (the default) for
    MCJE data, "<" for MCBE (MCPE) data.
    """
    with io.BytesIO() as buffer:
        dump(obj, buffer, name=name, endian=endian)
        return buffer.getvalue()
def _load(file, typ, preserve, endian):
    """Read one payload of NBT tag type `typ` from `file` and return it.

    file     -- object with a read(n) method returning bytes.
    typ      -- NBT tag ID of the payload to read.
    preserve -- if true, numbers come back as ctypes wrappers, lists as
                NBTList, int/long arrays as NBTTuple of ctypes wrappers and
                compounds as OrderedDict; otherwise plain Python objects.
    endian   -- struct byte-order prefix: ">" for MCJE data, "<" for MCBE data.

    Raises ValueError for an unknown tag ID, a duplicate compound key or a
    negative array length, and struct.error if the data ends early inside a
    fixed-size field.
    """
    if typ == BYTE:
        # Bytes are signed.  Reading with "b" makes preserve=False return -1
        # for 0xFF rather than 255, agreeing with what c_byte would hold.
        ret, = struct.unpack(endian + "b", file.read(1))
        return c_byte(ret) if preserve else ret
    elif typ == SHORT:
        ret, = struct.unpack(endian + "h", file.read(2))
        return c_int16(ret) if preserve else ret
    elif typ == INT:
        ret, = struct.unpack(endian + "l", file.read(4))
        return c_int32(ret) if preserve else ret
    elif typ == LONG:
        ret, = struct.unpack(endian + "q", file.read(8))
        return c_int64(ret) if preserve else ret
    elif typ == FLOAT:
        ret, = struct.unpack(endian + "f", file.read(4))
        return c_float(ret) if preserve else ret
    elif typ == DOUBLE:
        ret, = struct.unpack(endian + "d", file.read(8))
        return c_double(ret) if preserve else ret
    elif typ == BYTES:
        length, = struct.unpack(endian + "l", file.read(4))
        if length < 0:
            # read() with a negative size would swallow the rest of the stream.
            raise ValueError("negative array length {!r}".format(length))
        return file.read(length)
    elif typ == STRING:
        # UNSIGNED 16-bit byte count, the same as for tag names in load().  A
        # signed read would turn long strings into a negative length and make
        # read() swallow the rest of the stream.
        length, = struct.unpack(endian + "H", file.read(2))
        return file.read(length).decode("utf-8", errors="replace")
    elif typ == LIST:
        inner, = struct.unpack(endian + "B", file.read(1))
        length, = struct.unpack(endian + "l", file.read(4))
        # A negative length simply yields an empty list here.
        ret = [_load(file, inner, preserve, endian) for _ in range(length)]
        return NBTList(inner, ret) if preserve else ret
    elif typ == COMPOUND:
        # OrderedDict is guaranteed to be ordered; this has only been guaranteed for dict since
        # Python 3.7 (CPython happened to behave that way from 3.6).  Hence if we're preserving
        # info, we want to guarantee preservation of order.
        ret = OrderedDict() if preserve else {}
        while True:
            # Each entry is a complete named tag, hence load() rather than _load().
            name, value = load(file, preserve=preserve, with_name=True, endian=endian)
            if value is None:  # load() reports the terminating END tag as (None, None)
                break
            if name in ret:
                raise ValueError("duplicate key: {!r}".format(name))
            ret[name] = value
        return ret
    elif typ == INTS:
        length, = struct.unpack(endian + "l", file.read(4))
        if length < 0:
            raise ValueError("negative array length {!r}".format(length))
        ret = struct.unpack(endian + ("l" * length), file.read(4 * length)) if length else ()
        if preserve:
            ret = NBTTuple(typ, (c_int32(i) for i in ret))
        return ret
    elif typ == LONGS:
        length, = struct.unpack(endian + "l", file.read(4))
        if length < 0:
            raise ValueError("negative array length {!r}".format(length))
        ret = struct.unpack(endian + ("q" * length), file.read(8 * length)) if length else ()
        if preserve:
            ret = NBTTuple(typ, (c_int64(i) for i in ret))
        return ret
    else:
        raise ValueError("unsupported NBT field type {!r}".format(typ))
def load(file, *, preserve=True, with_name=False, endian=">"):
    """ Read one uncompressed NBT tag from a file (file) and return its content.

    With preserve=True (the default) the original NBT data is kept as
    faithfully as possible; in particular, numerical values come back as
    ctypes wrappers. With preserve=False, values come back in their natural
    Python representations and may not keep the same types if dumped again.

    With with_name=False (the default) only the object is returned, and a
    ValueError is raised if the root tag has a non-empty name. With
    with_name=True a (name, object) tuple is returned. For an END tag the
    result is None, or (None, None) when with_name=True.

    endian is a struct-module byte-order prefix: ">" (the default) for
    MCJE data, "<" for MCBE (MCPE) data.
    """
    (typ,) = struct.unpack(endian + "B", file.read(1))
    if typ == END:
        # An END tag consists of the tag ID alone: no name, no payload.
        if with_name:
            return (None, None)
        return None
    (namelen,) = struct.unpack(endian + "H", file.read(2))
    name = file.read(namelen).decode("utf-8", errors="replace")
    if name and not with_name:
        raise ValueError("named root tag without with_name=True")
    payload = _load(file, typ, preserve, endian)
    if with_name:
        return name, payload
    return payload
def loads(dat, *, preserve=True, with_name=False, endian=">"):
    """ Parse uncompressed NBT byte data (dat) and return the object it holds.

    With preserve=True (the default) the original NBT data is kept as
    faithfully as possible; in particular, numerical values come back as
    ctypes wrappers. With preserve=False, values come back in their natural
    Python representations and may not keep the same types if dumped again.

    With with_name=False (the default) only the object is returned, and an
    error is raised if the root tag is named. With with_name=True a tuple of
    the name (even if an empty string) and the object is returned. Both of
    these are None in the event of an END tag.

    endian is a struct-module byte-order prefix: ">" (the default) for
    MCJE data, "<" for MCBE (MCPE) data.
    """
    stream = io.BytesIO(dat)
    return load(stream, preserve=preserve, with_name=with_name, endian=endian)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment