amintos/kadokusei.py

## kadokusei.py
"""
KADOKUSEI Number and Byte-Stream Representation
(c) 2013 | Toni Mattis | MIT Licensed

This code allows abstract numbers, e.g. coordinates, public keys or hashes,
to be represented in a pronounceable way. Composition is based on Hiragana.

Example:

    >>> encode_number(718428)
    'zusukyu'

    >>> decode_number('zusukyu')
    718428


    # Avoid phonetic similarities (k/g, s/z, t/d, v/w, ...) by setting
    # the safe parameter to True. May increase code length:

    >>> encode_number(718428, safe=True)
    'tekyasho'

    >>> decode_number('tekyasho', safe=True)
    718428


    # Encode longer strings interpreted as binary data. Supports custom
    # delimiters and phonetic safety:

    >>> encode_string("This is not the string you're looking for!")
    'ken idajiryon unyoryuryo oshahowen osanin tsutoryo osa ipebyagin zachun...

    >>> decode_string('on-eryomyamu-gyapuchi-umisu-egukyahyun',
                      safe=True, delimiter='-')
    'H3LL0 W0RLD'


"""

# Optional leading vowel of a code word (the empty string means "none").
START_TOKENS = [''] + 'a i u e o'.split()

# Base syllabary, one consonant row per line. A syllable's position in the
# list is the digit it stands for, so the order must never change.
MID_TOKENS = (
    'ka ki ku ke ko kya kyu kyo '
    'sa shi su se so sha shu sho '
    'ta chi tsu te to cha chu cho '
    'na ni nu ne no nya nyu nyo '
    'ha hi fu he ho hya hyu hyo '
    'ma mi mu me mo mya myu myo '
    'ya yu yo '
    'ra ri ru re ro rya ryu ryo '
    'wa wi we wo'
).split()

# Extended syllabary: the base rows followed by the g/z/d/b/p rows and 'vu'.
EXT_TOKENS = MID_TOKENS + (
    'ga gi gu ge go gya gyu gyo '
    'za ji zu ze zo ja ju jo '
    'da de do '
    'ba bi bu be bo bya byu byo '
    'pa pi pu pe po pya pyu pyo '
    'vu'
).split()

# Optional trailing consonant of a code word.
END_TOKENS = ['', 'n']

# Number of syllables in each table, i.e. the numeric base used with it.
MID_SIZE, EXT_SIZE = len(MID_TOKENS), len(EXT_TOKENS)

def encode_number(n, safe=False):
    """Encode a non-negative integer as a pronounceable KADOKUSEI code.

    The number is consumed from its least significant end: ``n % 6`` picks
    the optional leading vowel, the next binary digit picks the optional
    trailing ``'n'``, and whatever remains is written digit by digit
    (least significant first) with one syllable per digit.

    Args:
        n: Non-negative integer to encode.
        safe: If true, use only MID_TOKENS (base MID_SIZE) for the
            syllables; otherwise use EXT_TOKENS (base EXT_SIZE).

    Returns:
        The code as a string. ``0`` encodes to the empty string.

    Raises:
        ValueError: If ``n`` is negative; such a value cannot be
            represented and would otherwise yield a misleading code.
    """
    if n < 0:
        raise ValueError("Cannot encode a negative number: %r" % (n,))

    tokens, size = (MID_TOKENS, MID_SIZE) if safe else (EXT_TOKENS, EXT_SIZE)

    # divmod floors, which keeps this pure integer arithmetic on both
    # Python 2 and Python 3 (a plain "/" would produce a float on Python 3).

    # The code starts with a vowel if n is not divisible by 6
    n, start_index = divmod(n, 6)

    # The code ends with an 'n' if the remaining quotient is odd
    n, end_index = divmod(n, 2)

    # The more significant information is translated by syllabary
    syllables = []
    while n > 0:
        n, digit = divmod(n, size)
        syllables.append(tokens[digit])

    return (START_TOKENS[start_index]
            + ''.join(syllables)
            + END_TOKENS[end_index])

def INDEX(lst):
    """Return a dict that maps every item of ``lst`` to its list index."""
    return dict((item, position) for position, item in enumerate(lst))


# Reverse lookup tables (token -> index) for each of the token lists
START_INVERTED = INDEX(START_TOKENS)
MID_INVERTED = INDEX(MID_TOKENS)
EXT_INVERTED = INDEX(EXT_TOKENS)
END_INVERTED = INDEX(END_TOKENS)

def decode_number(text, safe=False):
    """Decode a pronounceable KADOKUSEI code back into a number.

    Args:
        text: The code string to decode.
        safe: Selects the syllable table: MID_INVERTED (base MID_SIZE)
            when true, EXT_INVERTED (base EXT_SIZE) otherwise. It has to
            match the flag the code was encoded with.

    Returns:
        The decoded integer. An empty ``text`` decodes to 0.

    Raises:
        ValueError: If a part of ``text`` is not a syllable of the
            selected table.
    """
    if not text:
        return 0

    inverted, size = ((MID_INVERTED, MID_SIZE) if safe
                      else (EXT_INVERTED, EXT_SIZE))

    # Optional leading vowel; `run` is the index where the syllables begin.
    start = text[0]
    if start in START_INVERTED:
        run = 1
    else:
        start = ''
        run = 0

    # Optional trailing 'n'; `limit` is the index where the syllables stop.
    end = text[-1]
    if end in END_INVERTED:
        limit = len(text) - 1
    else:
        end = ''
        limit = len(text)

    result = 0
    base = 1
    while run < limit:
        # Syllables are two or three letters long; try the shorter first.
        part = text[run:run + 2]
        part_value = inverted.get(part)

        if part_value is None:
            part = text[run:run + 3]
            part_value = inverted.get(part)
            if part_value is None:
                raise ValueError("Unrecognized substring: %s" % part)

        # Syllables are stored least significant digit first.
        result += part_value * base
        base *= size
        run += len(part)

    # Re-assemble: syllable value * (6 * 2) + end bit * 6 + start vowel.
    return result * 12 + END_INVERTED[end] * 6 + START_INVERTED[start]

# ------------------------------------------------------------------------------
#
#   FULL STRING ENCODING
#
# ------------------------------------------------------------------------------

# Side note:
# The following generator yields the byte lengths into which the input
# is chunked. The sequence of chunk lengths itself is an error detection code.

def _default_generator():
    s = 2
    while True:
        n = yield (s % 3) + 1
        s = (s * s + n + 1337) % 65537


def encode_string(s, delimiter=' ', safe=False, chunk_generator=None):
    """Encode a byte string into words of the KADOKUSEI code.

    The first word encodes the length of ``s``. Each following word encodes
    one chunk of bytes packed little-endian into a single number; the chunk
    lengths come from ``chunk_generator``, which receives every packed
    chunk value through ``send()``.

    Args:
        s: The data to encode: a ``bytes``/``bytearray`` object or a string
            whose characters all have code points below 256.
        delimiter: String placed between the words.
        safe: Accepted but not forwarded to ``encode_number``; every word
            is produced with the extended syllable table.
            NOTE(review): confirm whether this is intended.
        chunk_generator: Generator yielding chunk lengths; defaults to a
            fresh ``_default_generator()``.

    Returns:
        The encoded words joined by ``delimiter``.

    Raises:
        ValueError: If ``s`` contains a character that does not fit into
            one byte (it would overlap neighbouring bytes when packed).
    """
    # Indexing yields 1-character strings for text but plain ints for
    # Python 3 bytes / bytearray, so normalise to integer byte values.
    values = [c if isinstance(c, int) else ord(c) for c in s]
    if any(value > 0xFF for value in values):
        raise ValueError("Input contains characters outside the byte range")

    generator = chunk_generator or _default_generator()
    chunksize = next(generator)

    total = len(values)
    words = [encode_number(total)]

    position = 0
    while position < total:
        # The slice is automatically shorter for the final chunk.
        chunk = values[position:position + chunksize]
        buf = 0
        for shift, value in enumerate(chunk):
            buf |= value << (8 * shift)
        position += len(chunk)

        words.append(encode_number(buf))
        chunksize = generator.send(buf)

    return delimiter.join(words)

def decode_string(s, delimiter=' ', safe=False, chunk_generator=None):
    """Decode a string represented in KADOKUSEI code.

    The first word is read as the length of the result. Every following
    word is decoded to a number and unpacked little-endian into as many
    bytes as the chunk generator currently prescribes.

    Args:
        s: The encoded words joined by ``delimiter``.
        delimiter: String that separates the words in ``s``.
        safe: Accepted but not forwarded to ``decode_number``; every word
            is read with the extended syllable table.
            NOTE(review): confirm whether this is intended.
        chunk_generator: Generator yielding chunk lengths; defaults to a
            fresh ``_default_generator()``. It has to produce the same
            sequence as the one used for encoding.

    Returns:
        The decoded data as a string with one character per byte, cut to
        the length given by the first word.

    Raises:
        ValueError: If a word holds more data than its chunk length allows
            or contains an unknown syllable.
    """
    generator = chunk_generator or _default_generator()
    chunksize = next(generator)

    words = iter(s.split(delimiter))
    size = decode_number(next(words))

    chars = []
    for word in words:
        buf = orig_buf = decode_number(word)
        for _ in range(chunksize):
            chars.append(chr(buf & 0xFF))
            buf >>= 8

        # Bits left over mean the word does not fit the expected chunk.
        if buf > 0:
            raise ValueError("Code deemed invalid")
        chunksize = generator.send(orig_buf)

    # The final chunk may have been padded with zero bytes; drop them.
    return ''.join(chars[:size])

# ------------------------------------------------------------------------------
#
#   SELF TEST WHEN INVOKED STANDALONE
#
# ------------------------------------------------------------------------------

if __name__ == "__main__":
    # Self test: round-trips random numbers and random byte strings.
    # Written so that it runs unchanged on Python 2 and Python 3.
    import os
    import random
    import sys

    print("Running quick test...")
    for i in range(20):
        r = random.randint(2**i, 2**(i + 2))
        for safe in (False, True):
            code = encode_number(r, safe=safe)
            # Prints 'OK' on a successful round trip, 'False' otherwise.
            status = r == decode_number(code, safe=safe) and 'OK'
            print("%s %s %s" % (r, code, status))

    # No newline here, so the final "done." ends up on the same line.
    sys.stdout.write("done. Running stress test... ")
    sys.stdout.flush()
    for i in range(100):
        for j in range(100):
            r = os.urandom(i)
            decoded = decode_string(encode_string(r))
            # decode_string returns text; compare byte values so the check
            # holds for Python 2 str as well as Python 3 bytes input.
            assert [ord(c) for c in decoded] == list(bytearray(r))
    print("done.")


# -----------------------------------------------------------------------------
# Copyright (C) 2013 | Toni Mattis | Licensed under the MIT License
# -----------------------------------------------------------------------------
	"""
	KADOKUSEI Number and Byte-Stream Representation
	(c) 2013 | Toni Mattis | MIT Licensed

	This code allows abstract numbers, i.e. coordinates, public keys or hashes
	to be represented in a pronounceable way. Composition is based on Hiragana.

	Example:

	>>> encode_number(718428)
	'zusukyu'

	>>> decode_number('zusukyu')
	718428


	# Avoid phonetic similarities (k/g, s/z, t/d, v/w, ...) by setting
	# the safe parameter to True. May increase code length:

	>>> encode_number(718428, safe=True)
	'tekyasho'

	>>> decode_number('tekyasho', safe=True)
	718428


	# Encode longer strings interpreted as binary data. Supports custom
	# delimiters and phonetic safety:

	>>> encode_string("This is not the string you're looking for!")
	'ken idajiryon unyoryuryo oshahowen osanin tsutoryo osa ipebyagin zachun...

	>>> decode_string('on-eryomyamu-gyapuchi-umisu-egukyahyun',
	safe=True, delimiter='-')
	'H3LL0 W0RLD'


	"""

	START_TOKENS = [
	'', 'a', 'i', 'u', 'e', 'o'
	]

	MID_TOKENS = [
	'ka', 'ki', 'ku', 'ke', 'ko', 'kya', 'kyu', 'kyo',
	'sa', 'shi', 'su', 'se', 'so', 'sha', 'shu', 'sho',
	'ta', 'chi', 'tsu', 'te', 'to', 'cha', 'chu', 'cho',
	'na', 'ni', 'nu', 'ne', 'no', 'nya', 'nyu', 'nyo',
	'ha', 'hi', 'fu', 'he', 'ho', 'hya', 'hyu', 'hyo',
	'ma', 'mi', 'mu', 'me', 'mo', 'mya', 'myu', 'myo',
	'ya', 'yu', 'yo',
	'ra', 'ri', 'ru', 're', 'ro', 'rya', 'ryu', 'ryo',
	'wa', 'wi', 'we', 'wo',
	]

	EXT_TOKENS = MID_TOKENS + [
	'ga', 'gi', 'gu', 'ge', 'go', 'gya', 'gyu', 'gyo',
	'za', 'ji', 'zu', 'ze', 'zo', 'ja', 'ju', 'jo',
	'da', 'de', 'do',
	'ba', 'bi', 'bu', 'be', 'bo', 'bya', 'byu', 'byo',
	'pa', 'pi', 'pu', 'pe', 'po', 'pya', 'pyu', 'pyo',
	'vu',
	]

	END_TOKENS = ['', 'n']

	MID_SIZE = len(MID_TOKENS)
	EXT_SIZE = len(EXT_TOKENS)

	def encode_number(n, safe=False):
	"""Encodes a small number to a pronounceable KADOKUSEI-Code"""
	TOKENS, SIZE = (MID_TOKENS, MID_SIZE) if safe else (EXT_TOKENS, EXT_SIZE)

	# The code starts with a vowel if n is not divisible by 6
	start = START_TOKENS[n % 6]
	n /= 6

	# The code ends with an 'n' if the remainder is odd
	end = END_TOKENS[n % 2]
	n /= 2

	# The more significant information is translated by syllabary
	mid = ''
	while n > 0:
	mid += TOKENS[n % SIZE]
	n /= SIZE

	return start + mid + end

	# Inverts those lists to a dictionary mapping list-items onto their index
	INDEX = lambda lst: {k : v for v, k in enumerate(lst)}

	START_INVERTED = INDEX(START_TOKENS)
	MID_INVERTED = INDEX(MID_TOKENS)
	EXT_INVERTED = INDEX(EXT_TOKENS)
	END_INVERTED = INDEX(END_TOKENS)

	def decode_number(text, safe=False):
	"""Decodes a pronounceable KADOKUSEI-Code to a number"""
	result = 0
	INV, SIZE = (MID_INVERTED, MID_SIZE) if safe else (EXT_INVERTED, EXT_SIZE)

	if not text:
	return 0

	start = text[0]
	if start in START_INVERTED:
	run = 1
	else:
	start = ''
	run = 0

	limit = len(text) - 1
	end = text[-1]
	if end in END_INVERTED:
	limit = len(text) - 1
	else:
	end = ''
	limit = len(text)

	base = 1
	while run < limit:

	# try 2 letters
	part = text[run : run + 2]
	part_value = INV.get(part, None)

	if part_value == None:
	# opt for 3 letters at once
	part = text[run : run + 3]
	part_value = INV.get(part, None)
	if part_value == None:
	raise ValueError, "Unrecognized substring: %s" % part

	result += part_value * base
	base *= SIZE
	run += len(part)

	return result * 12 + END_INVERTED[end] * 6 + START_INVERTED[start]

	# ------------------------------------------------------------------------------
	#
	# FULL STRING ENCODING
	#
	# ------------------------------------------------------------------------------

	# (side note:
	# The following generator yields different byte lengths in which the input
	# is chunked. The sequence of chunk lengths itself is an error detection code.

	def _default_generator():
	s = 2
	while True:
	n = yield (s % 3) + 1
	s = (s * s + n + 1337) % 65537


	def encode_string(s, delimiter=' ', safe=False, chunk_generator=None):
	"""Encodes an arbitrary string into words of the KADOKUSEI-Code"""

	generator = chunk_generator or _default_generator()
	chunksize = generator.next()

	i = 0
	n = len(s)
	result = [encode_number(n)]

	while i < n:
	buf = 0
	for j in range(min(chunksize, n - i)):
	buf |= ord(s[i]) << (8 * j)
	i += 1
	result.append(encode_number(buf))
	chunksize = generator.send(buf)
	if i + chunksize >= n: chunksize = n - i

	return delimiter.join(result)

	def decode_string(s, delimiter=' ', safe=False, chunk_generator=None):
	"""Decodes a string represented in KADOKUSEI-Code"""

	generator = chunk_generator or _default_generator()
	chunksize = generator.next()
	result = []

	it = iter(s.split(delimiter))
	size = decode_number(it.next())

	for chunk in it:
	buf = orig_buf = decode_number(chunk)
	for j in range(chunksize):
	result.append(chr(buf & 0xFF))
	buf >>= 8

	if buf > 0:
	raise ValueError, "Code deemed invalid"
	chunksize = generator.send(orig_buf)

	return ''.join(result[:size])

	# ------------------------------------------------------------------------------
	#
	# SELF TEST WHEN INVOKED STANDALONE
	#
	# ------------------------------------------------------------------------------

	if __name__ == "__main__":
	import os, random

	print "Running quick test..."
	for i in xrange(20):
	r = random.randint(2**i, 2**(i + 2))
	print r, encode_number(r), r == decode_number(encode_number(r)) and 'OK'

	print r, encode_number(r, safe=True), \
	r == decode_number(encode_number(r, safe=True),
	safe=True) and 'OK'


	print "done. Running stress test...",
	for i in xrange(100):
	for j in xrange(100):
	r = os.urandom(i)
	assert decode_string(encode_string(r)) == r
	print "done."


	# -----------------------------------------------------------------------------
	# Copyright (C) 2013 | Toni Mattis | Licensed under the MIT License
	# -----------------------------------------------------------------------------