# lukestanley/xuuid.py

## xuuid.py
# xuuid.py
"""
xuuid 0.1.1 -- Extended Universally Unique Identifiers

Extremely large UUIDs that are:
    more certainly unique,
    descriptive,
    supported by tooling for short local reference.

There is a distinction made between "hard" and "soft" data for an identifier.

"Hard" data is all of the data required for a specific identifier -- its unique signature.

    the type of the identifier (a string, e.g: "person", "company", "computer motherboard")
    identifying information (string, e.g: "Dell Inspiron model")
    time of creation of the identifier (a Unix timestamp)
    random data to ensure uniqueness (secure random bytes from the secrets module; 5 bytes by default)

"Soft" data is supporting data such as:

    sourcing information -- a URL where information associated with the identifier can be found
    authentication information -- for verifying identity and correctness of some assertion
    "AKA"s -- information about what other identifiers are asserted to be equivalent to this identifier
    suggested short-naming
    additional descriptive information -- additional information about the thing described
    additional general information -- for example, suggestions on how to configure a system for use of the thing identified

AUTHORS: Spec by Lion Kimbro, implementation by Luke Stanley.

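The returned "number" is a base-62 string (digits and ASCII letters, with "-" separators added
for readability) that encodes a MsgPack payload, zlib-compressed when that makes it smaller.
The MsgPack payload holds two objects: the hard list and the soft dict.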

Example usage:
>> import xuuid
>> xuuid.xuuid("Person","TimKindberg",soft_akas=["champignon.net"])
'oZKZJRMJN-s1l0-FoPw-Wcdq-ippr-iEhc-jgwI-4OjX-1RJS-ubez-SbOS-VgPF-sh6Z-t6OW-dTKg-KlVP-bxk27dtfebE'
>> xuuid.parse("9F8oYJtH1-woJE-OBQ9-aRdA-gAfk-ZZJ5-pfH8-oXQi-AwUn-UhWl-vsdn-Cb3c-em1d-PvqW-TrLf-7A7Z-zA3N-Umox-Sgpx-v6aNZqYoN2dpEa")
{
    'hard_type': 'Person',
    'hard_info': 'TimKindberg',
    'creation_time': 1662135109456161279,
    'randomness': '3a821d41b7',
    'soft_akas': ['champignon.net']
}
"""

import zlib
from sys import exit
from time import time_ns
from collections import deque
from secrets import token_bytes as random_bytes

# msgpack is the only third-party dependency. Nothing in this module can work
# without it, so report the problem and terminate with a non-zero status.
try:
    import msgpack
except ImportError:
    print("Error: msgpack not installed. This is required. pip install msgpack")
    exit(1)


# Reference instant in nanoseconds since the Unix epoch (2 September 2022 UTC).
# xuuid() stores creation times relative to it and parse() adds it back.
BASE_TIME = 1662125559016967741
# Digit set for the base-62 text encoding: a character's position is its value.
BASE_62_ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"


def compress(data: bytes) -> bytes:
    """Return the zlib-compressed form of ``data`` only when it is smaller.

    The payload is deflated at the maximum level (9). If that does not
    produce strictly fewer bytes than the input, the input is handed back
    unchanged, so the caller always receives the shorter representation.
    """
    deflated = zlib.compress(data, level=9)
    return deflated if len(deflated) < len(data) else data


def decompress(data: bytes) -> bytes:
    """Inflate a zlib stream and return the original bytes.

    ``zlib.decompress`` raises ``zlib.error`` when ``data`` is not a valid
    zlib stream; that exception is allowed to propagate to the caller.
    """
    inflated = zlib.decompress(data)
    return inflated


def add_dashes(data: str) -> str:
    """Insert "-" separators into ``data`` to make it easier to read.

    A dash is placed after every character whose zero-based index is a
    multiple of four, except within the first eight characters and within
    the final stretch of the string (indices above ``len(data) - 12``).
    The result is a nine-character head, then four-character groups, then
    a longer undashed tail. Strings shorter than 20 characters come back
    unchanged.
    """
    last_dash_index = len(data) - 12
    pieces = []
    for position, symbol in enumerate(data):
        pieces.append(symbol)
        if position % 4 == 0 and 8 <= position <= last_dash_index:
            pieces.append("-")
    return "".join(pieces)


def reversible_string_data_interleaving(data: str, reverse=False) -> str:
    """Rotate the characters of ``data`` by ten positions.

    By default the rotation is to the right, so the last ten characters
    move to the front. With ``reverse=True`` the rotation goes to the left
    instead, which undoes an earlier forward call. Strings shorter than
    ten characters simply wrap around.
    """
    shift = -10 if reverse else 10
    ring = deque(data)
    ring.rotate(shift)
    return "".join(ring)


def bytes_to_base_62(data: bytes) -> str:
    """Encode ``data`` as base-62 text for easy copy-pasting (unlike base-64).

    The bytes are read as a single big-endian unsigned integer, which is
    then written out using ``BASE_62_ALPHABET`` as the digit set. Leading
    zero bytes contribute nothing to that integer, so they are not
    represented in the output; empty or all-zero input yields "".
    """
    remaining = int.from_bytes(data, byteorder="big")
    digits = []
    while remaining > 0:
        remaining, digit = divmod(remaining, 62)
        digits.append(BASE_62_ALPHABET[digit])
    # Digits were collected least-significant first.
    return "".join(reversed(digits))


def base_62_to_bytes(data: str) -> bytes:
    """Decode base-62 text into the shortest big-endian byte string.

    Each character's value is its position in ``BASE_62_ALPHABET``; a
    character outside that alphabet makes ``str.index`` raise
    ``ValueError``. The accumulated integer is emitted with the minimum
    number of bytes, so "" (or a string of "0" digits) decodes to b"".
    """
    value = 0
    for symbol in data:
        value *= 62
        value += BASE_62_ALPHABET.index(symbol)
    byte_count = (value.bit_length() + 7) // 8
    return value.to_bytes(byte_count, byteorder="big")


def xuuid(
    hard_type: str,
    hard_info: str,
    soft_sources: list = None,
    soft_auth: list = None,
    soft_akas: list = None,
    soft_short: str = None,
    soft_desc: str = None,
    soft_info: str = None,
    creation_time: int = None,
    randomness: bytes = None,
    random_byte_count: int = 5,
    use_compression: bool = True,
) -> str:
    """
    Create an xuuid.

    hard_type -- a string describing the type of the thing being identified
    hard_info -- a string describing the thing being identified
    soft_sources -- a list of URLs where information about the thing being identified can be found
    soft_auth -- a list of authentication information for verifying identity and correctness of some assertion
    soft_akas -- a list of other identifiers that are asserted to be equivalent to this identifier
    soft_short -- a suggested short-name for the thing being identified
    soft_desc -- additional descriptive information about the thing being identified
    soft_info -- additional general information about the thing being identified
    creation_time -- when the identifier was created, as integer nanoseconds since the Unix epoch (defaults to time_ns(), i.e. now)
    randomness -- WARNING, ONLY USE FOR TESTS OR IF YOU KNOW WHAT YOU ARE DOING! random bytes to ensure uniqueness (defaults to a fresh random byte string; empty bytes are treated the same as None)
    random_byte_count -- the number of random bytes to generate when randomness is not supplied (defaults to 5)
    use_compression -- whether to zlib-compress the packed data when that makes it smaller (defaults to True)

    Returns the identifier as dash-separated base-62 text that parse() can decode.
    """
    # Compare against None so that an explicit timestamp of 0 is honoured.
    if creation_time is None:
        creation_time = time_ns()
    # Empty bytes carry no entropy, so they are treated like "not supplied".
    if not randomness:
        randomness = random_bytes(random_byte_count)

    # Stored relative to BASE_TIME (both in nanoseconds) to keep the integer small.
    elapsed_ns = creation_time - BASE_TIME
    hard_list = [hard_type, hard_info, elapsed_ns, randomness]

    # Soft fields are listed explicitly (in parameter order, keys without the
    # "soft_" prefix) rather than discovered through locals(), which is
    # fragile under debuggers/tracing. Only supplied fields are stored.
    soft_candidates = (
        ("sources", soft_sources),
        ("auth", soft_auth),
        ("akas", soft_akas),
        ("short", soft_short),
        ("desc", soft_desc),
        ("info", soft_info),
    )
    soft_data = {key: value for key, value in soft_candidates if value is not None}

    packed = msgpack.dumps([hard_list, soft_data])
    payload = compress(packed) if use_compression else packed
    based = reversible_string_data_interleaving(bytes_to_base_62(payload))
    return add_dashes(based)


def parse(xuuid: str) -> dict:
    """
    Parse an xuuid.

    xuuid -- the base62 encoded xuuid to parse (dashes are ignored)

    Returns a dict with the keys "hard_type", "hard_info", "creation_time"
    (BASE_TIME added back) and "randomness" (as a hex string), followed by
    one "soft_"-prefixed key for every soft field stored in the identifier.
    """
    # Dashes are only visual separators; drop them before decoding.
    compact = xuuid.replace("-", "")
    uninterleaved = reversible_string_data_interleaving(compact, True)
    xuuid_bytes = base_62_to_bytes(uninterleaved)

    # The payload is only compressed when that made it smaller, so a failed
    # inflate means it is raw MsgPack. Catch zlib.error specifically so that
    # unrelated failures (and KeyboardInterrupt) are not silently swallowed.
    try:
        uncompressed = decompress(xuuid_bytes)
    except zlib.error:
        uncompressed = xuuid_bytes

    hard_list, soft_dict = msgpack.loads(uncompressed)
    hard_dict = {
        "hard_type": hard_list[0],
        "hard_info": hard_list[1],
        "creation_time": hard_list[2] + BASE_TIME,
        "randomness": hard_list[3].hex(),
    }
    # Restore the "soft_" prefix that xuuid() strips before packing.
    prefixed_soft = {"soft_" + key: value for key, value in soft_dict.items()}
    return {**hard_dict, **prefixed_soft}
	# xuuid.py
	"""
	xuuid 0.1.1 -- Extended Universally Unique Identifiers

	Extremely large UUIDs that are:
	more certainly unique,
	descriptive,
	supported by tooling for short local reference.

	There is a distinction made between "hard" and "soft" data for an identifier.

	"Hard" data is all of the data required for a specific identifier -- its unique signature.

	the type of the identifier (a string, e.g: "person", "company", "computer motherboard")
	identifying information (string, e.g: "Dell Inspiron model")
	time of creation of the identifier (a Unix timestamp)
	random data to ensure uniqueness (secure random bytes from the secrets module; 5 bytes by default)

	"Soft" data is supporting data such as:

	sourcing information -- a URL where information associated with the identifier can be found
	authentication information -- for verifying identity and correctness of some assertion
	"AKA"s -- information about what other identifiers are asserted to be equivalent to this identifier
	suggested short-naming
	additional descriptive information -- additional information about the thing described
	additional general information -- for example, suggestions on how to configure a system for use of the thing identified

	AUTHORS: Spec by Lion Kimbro, implementation by Luke Stanley.

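	The returned "number" is a base-62 string (digits and ASCII letters, with "-" separators added
	for readability) that encodes a MsgPack payload, zlib-compressed when that makes it smaller.
	The MsgPack payload holds two objects: the hard list and the soft dict.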

	Example usage:
	>> import xuuid
	>> xuuid.xuuid("Person","TimKindberg",soft_akas=["champignon.net"])
	'oZKZJRMJN-s1l0-FoPw-Wcdq-ippr-iEhc-jgwI-4OjX-1RJS-ubez-SbOS-VgPF-sh6Z-t6OW-dTKg-KlVP-bxk27dtfebE'
	>> xuuid.parse("9F8oYJtH1-woJE-OBQ9-aRdA-gAfk-ZZJ5-pfH8-oXQi-AwUn-UhWl-vsdn-Cb3c-em1d-PvqW-TrLf-7A7Z-zA3N-Umox-Sgpx-v6aNZqYoN2dpEa")
	{
	'hard_type': 'Person',
	'hard_info': 'TimKindberg',
	'creation_time': 1662135109456161279,
	'randomness': '3a821d41b7',
	'soft_akas': ['champignon.net']
	}
	"""

	import zlib
	from sys import exit
	from time import time_ns
	from collections import deque
	from secrets import token_bytes as random_bytes

	# msgpack is the only third-party dependency. Nothing in this module can work
	# without it, so report the problem and terminate with a non-zero status.
	try:
	    import msgpack
	except ImportError:
	    print("Error: msgpack not installed. This is required. pip install msgpack")
	    exit(1)


	# Reference instant in nanoseconds since the Unix epoch (2 September 2022 UTC).
	# xuuid() stores creation times relative to it and parse() adds it back.
	BASE_TIME = 1662125559016967741
	# Digit set for the base-62 text encoding: a character's position is its value.
	BASE_62_ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"


	def compress(data: bytes) -> bytes:
	    """Return the zlib-compressed form of ``data`` only when it is smaller.

	    The payload is deflated at the maximum level (9). If that does not
	    produce strictly fewer bytes than the input, the input is handed back
	    unchanged, so the caller always receives the shorter representation.
	    """
	    deflated = zlib.compress(data, level=9)
	    return deflated if len(deflated) < len(data) else data


	def decompress(data: bytes) -> bytes:
	    """Inflate a zlib stream and return the original bytes.

	    ``zlib.decompress`` raises ``zlib.error`` when ``data`` is not a valid
	    zlib stream; that exception is allowed to propagate to the caller.
	    """
	    return zlib.decompress(data)


	def add_dashes(data: str) -> str:
	    """Insert "-" separators into ``data`` to make it easier to read.

	    A dash is placed after every character whose zero-based index is a
	    multiple of four, except within the first eight characters and within
	    the final stretch of the string (indices above ``len(data) - 12``).
	    The result is a nine-character head, then four-character groups, then
	    a longer undashed tail. Strings shorter than 20 characters come back
	    unchanged.
	    """
	    last_dash_index = len(data) - 12
	    pieces = []
	    for position, symbol in enumerate(data):
	        pieces.append(symbol)
	        if position % 4 == 0 and 8 <= position <= last_dash_index:
	            pieces.append("-")
	    return "".join(pieces)


	def reversible_string_data_interleaving(data: str, reverse=False) -> str:
	    """Rotate the characters of ``data`` by ten positions.

	    By default the rotation is to the right, so the last ten characters
	    move to the front. With ``reverse=True`` the rotation goes to the left
	    instead, which undoes an earlier forward call. Strings shorter than
	    ten characters simply wrap around.
	    """
	    shift = -10 if reverse else 10
	    ring = deque(data)
	    ring.rotate(shift)
	    return "".join(ring)


	def bytes_to_base_62(data: bytes) -> str:
	    """Encode ``data`` as base-62 text for easy copy-pasting (unlike base-64).

	    The bytes are read as a single big-endian unsigned integer, which is
	    then written out using ``BASE_62_ALPHABET`` as the digit set. Leading
	    zero bytes contribute nothing to that integer, so they are not
	    represented in the output; empty or all-zero input yields "".
	    """
	    remaining = int.from_bytes(data, byteorder="big")
	    digits = []
	    while remaining > 0:
	        remaining, digit = divmod(remaining, 62)
	        digits.append(BASE_62_ALPHABET[digit])
	    # Digits were collected least-significant first.
	    return "".join(reversed(digits))


	def base_62_to_bytes(data: str) -> bytes:
	    """Decode base-62 text into the shortest big-endian byte string.

	    Each character's value is its position in ``BASE_62_ALPHABET``; a
	    character outside that alphabet makes ``str.index`` raise
	    ``ValueError``. The accumulated integer is emitted with the minimum
	    number of bytes, so "" (or a string of "0" digits) decodes to b"".
	    """
	    value = 0
	    for symbol in data:
	        value = value * 62 + BASE_62_ALPHABET.index(symbol)
	    byte_count = (value.bit_length() + 7) // 8
	    return value.to_bytes(byte_count, byteorder="big")


	def xuuid(
	    hard_type: str,
	    hard_info: str,
	    soft_sources: list = None,
	    soft_auth: list = None,
	    soft_akas: list = None,
	    soft_short: str = None,
	    soft_desc: str = None,
	    soft_info: str = None,
	    creation_time: int = None,
	    randomness: bytes = None,
	    random_byte_count: int = 5,
	    use_compression: bool = True,
	) -> str:
	    """
	    Create an xuuid.

	    hard_type -- a string describing the type of the thing being identified
	    hard_info -- a string describing the thing being identified
	    soft_sources -- a list of URLs where information about the thing being identified can be found
	    soft_auth -- a list of authentication information for verifying identity and correctness of some assertion
	    soft_akas -- a list of other identifiers that are asserted to be equivalent to this identifier
	    soft_short -- a suggested short-name for the thing being identified
	    soft_desc -- additional descriptive information about the thing being identified
	    soft_info -- additional general information about the thing being identified
	    creation_time -- when the identifier was created, as integer nanoseconds since the Unix epoch (defaults to time_ns(), i.e. now)
	    randomness -- WARNING, ONLY USE FOR TESTS OR IF YOU KNOW WHAT YOU ARE DOING! random bytes to ensure uniqueness (defaults to a fresh random byte string; empty bytes are treated the same as None)
	    random_byte_count -- the number of random bytes to generate when randomness is not supplied (defaults to 5)
	    use_compression -- whether to zlib-compress the packed data when that makes it smaller (defaults to True)

	    Returns the identifier as dash-separated base-62 text that parse() can decode.
	    """
	    # Compare against None so that an explicit timestamp of 0 is honoured.
	    if creation_time is None:
	        creation_time = time_ns()
	    # Empty bytes carry no entropy, so they are treated like "not supplied".
	    if not randomness:
	        randomness = random_bytes(random_byte_count)

	    # Stored relative to BASE_TIME (both in nanoseconds) to keep the integer small.
	    elapsed_ns = creation_time - BASE_TIME
	    hard_list = [hard_type, hard_info, elapsed_ns, randomness]

	    # Soft fields are listed explicitly (in parameter order, keys without the
	    # "soft_" prefix) rather than discovered through locals(), which is
	    # fragile under debuggers/tracing. Only supplied fields are stored.
	    soft_candidates = (
	        ("sources", soft_sources),
	        ("auth", soft_auth),
	        ("akas", soft_akas),
	        ("short", soft_short),
	        ("desc", soft_desc),
	        ("info", soft_info),
	    )
	    soft_data = {key: value for key, value in soft_candidates if value is not None}

	    packed = msgpack.dumps([hard_list, soft_data])
	    payload = compress(packed) if use_compression else packed
	    based = reversible_string_data_interleaving(bytes_to_base_62(payload))
	    return add_dashes(based)


	def parse(xuuid: str) -> dict:
	    """
	    Parse an xuuid.

	    xuuid -- the base62 encoded xuuid to parse (dashes are ignored)

	    Returns a dict with the keys "hard_type", "hard_info", "creation_time"
	    (BASE_TIME added back) and "randomness" (as a hex string), followed by
	    one "soft_"-prefixed key for every soft field stored in the identifier.
	    """
	    # Dashes are only visual separators; drop them before decoding.
	    compact = xuuid.replace("-", "")
	    uninterleaved = reversible_string_data_interleaving(compact, True)
	    xuuid_bytes = base_62_to_bytes(uninterleaved)

	    # The payload is only compressed when that made it smaller, so a failed
	    # inflate means it is raw MsgPack. Catch zlib.error specifically so that
	    # unrelated failures (and KeyboardInterrupt) are not silently swallowed.
	    try:
	        uncompressed = decompress(xuuid_bytes)
	    except zlib.error:
	        uncompressed = xuuid_bytes

	    hard_list, soft_dict = msgpack.loads(uncompressed)
	    hard_dict = {
	        "hard_type": hard_list[0],
	        "hard_info": hard_list[1],
	        "creation_time": hard_list[2] + BASE_TIME,
	        "randomness": hard_list[3].hex(),
	    }
	    # Restore the "soft_" prefix that xuuid() strips before packing.
	    prefixed_soft = {"soft_" + key: value for key, value in soft_dict.items()}
	    # Merge by unpacking; a plain {a, b} would be a set literal and fail,
	    # because dicts are unhashable.
	    return {**hard_dict, **prefixed_soft}