Skip to content

Instantly share code, notes, and snippets.

@amintos
Created January 11, 2013 01:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amintos/4507279 to your computer and use it in GitHub Desktop.
Save amintos/4507279 to your computer and use it in GitHub Desktop.
Module providing Hiragana-like encodings for arbitrary data.
"""
KADOKUSEI Number and Byte-Stream Representation
(c) 2013 | Toni Mattis | MIT Licensed
This code allows abstract numbers, e.g. coordinates, public keys or hashes,
to be represented in a pronounceable way. Composition is based on Hiragana.
Example:
>>> encode_number(718428)
'zusukyu'
>>> decode_number('zusukyu')
718428
# Avoid phonetic similarities (k/g, s/z, t/d, v/w, ...) by setting
# the safe parameter to True. May increase code length:
>>> encode_number(718428, safe=True)
'tekyasho'
>>> decode_number('tekyasho', safe=True)
718428
# Encode longer strings interpreted as binary data. Supports custom
# delimiters and phonetic safety:
>>> encode_string("This is not the string you're looking for!")
'ken idajiryon unyoryuryo oshahowen osanin tsutoryo osa ipebyagin zachun...
>>> decode_string('on-eryomyamu-gyapuchi-umisu-egukyahyun',
...               delimiter='-')
'H3LL0 W0RLD'
"""
# Optional leading vowel of a code word; the empty string means "no vowel".
START_TOKENS = [''] + 'a i u e o'.split()

# Base syllabary, one row per consonant group. encode_number() and
# decode_number() restrict themselves to this table when safe=True.
MID_TOKENS = (
    'ka ki ku ke ko kya kyu kyo '
    'sa shi su se so sha shu sho '
    'ta chi tsu te to cha chu cho '
    'na ni nu ne no nya nyu nyo '
    'ha hi fu he ho hya hyu hyo '
    'ma mi mu me mo mya myu myo '
    'ya yu yo '
    'ra ri ru re ro rya ryu ryo '
    'wa wi we wo'
).split()

# Extended syllabary: the base table followed by the g/z/j/d/b/p/v rows.
# The base tokens keep their indices, the additional rows are appended.
EXT_TOKENS = MID_TOKENS + (
    'ga gi gu ge go gya gyu gyo '
    'za ji zu ze zo ja ju jo '
    'da de do '
    'ba bi bu be bo bya byu byo '
    'pa pi pu pe po pya pyu pyo '
    'vu'
).split()

# Optional trailing 'n' of a code word; the empty string means "no 'n'".
END_TOKENS = ['', 'n']

# Numeric bases used for the syllable digits (safe / extended).
MID_SIZE = len(MID_TOKENS)
EXT_SIZE = len(EXT_TOKENS)
def encode_number(n, safe=False):
    """Encode a non-negative integer as a pronounceable KADOKUSEI code.

    The number is split into three parts:

    * ``n % 6`` selects the optional leading vowel (``START_TOKENS``),
    * the parity of ``n // 6`` selects the optional trailing ``'n'``
      (``END_TOKENS``),
    * ``n // 12`` is written as base-``SIZE`` digits, least significant
      digit first, each digit being one syllable.

    Args:
        n: The non-negative integer to encode. ``0`` encodes to ``''``.
        safe: If true, only syllables from ``MID_TOKENS`` are used;
            otherwise the larger ``EXT_TOKENS`` table is used.

    Returns:
        The code word as a string.

    Raises:
        ValueError: If ``n`` is negative (such a value cannot be
            represented and would otherwise decode to a different number).
    """
    if n < 0:
        raise ValueError("Cannot encode a negative number: %r" % (n,))

    tokens, size = (MID_TOKENS, MID_SIZE) if safe else (EXT_TOKENS, EXT_SIZE)

    # divmod keeps everything in integer arithmetic on both Python 2 and 3
    # (a plain ``/`` would produce floats on Python 3).
    # The code starts with a vowel if n is not divisible by 6.
    n, start_index = divmod(n, 6)
    # The code ends with an 'n' if the remaining quotient is odd.
    n, end_index = divmod(n, 2)

    # The more significant information is translated by syllabary.
    syllables = []
    while n > 0:
        n, digit = divmod(n, size)
        syllables.append(tokens[digit])

    return START_TOKENS[start_index] + ''.join(syllables) + END_TOKENS[end_index]
def INDEX(lst):
    """Return a dict that maps every item of ``lst`` onto its index."""
    return dict((item, position) for position, item in enumerate(lst))


# Reverse lookup tables (token -> index) for each of the token lists.
START_INVERTED, MID_INVERTED, EXT_INVERTED, END_INVERTED = map(
    INDEX, (START_TOKENS, MID_TOKENS, EXT_TOKENS, END_TOKENS))
def decode_number(text, safe=False):
    """Decode a pronounceable KADOKUSEI code back to a number.

    Args:
        text: The code word. An empty string decodes to ``0``.
        safe: Must match the ``safe`` flag the code was produced with.
            If true, syllables are looked up in ``MID_INVERTED``,
            otherwise in ``EXT_INVERTED``.

    Returns:
        The decoded non-negative integer.

    Raises:
        ValueError: If a part of ``text`` is neither a two-letter nor a
            three-letter syllable of the selected table.
    """
    inverted, size = (MID_INVERTED, MID_SIZE) if safe else (EXT_INVERTED, EXT_SIZE)

    if not text:
        return 0

    # An optional leading vowel carries the value of n % 6.
    start = text[0]
    if start in START_INVERTED:
        run = 1
    else:
        start = ''
        run = 0

    # An optional trailing 'n' carries the parity bit.
    end = text[-1]
    if end in END_INVERTED:
        limit = len(text) - 1
    else:
        end = ''
        limit = len(text)

    # Everything in between is a sequence of syllables forming base-`size`
    # digits, least significant digit first.
    result = 0
    base = 1
    while run < limit:
        # Try 2 letters first ...
        part = text[run:run + 2]
        part_value = inverted.get(part)
        if part_value is None:
            # ... then opt for 3 letters at once.
            part = text[run:run + 3]
            part_value = inverted.get(part)
            if part_value is None:
                raise ValueError("Unrecognized substring: %s" % part)
        result += part_value * base
        base *= size
        run += len(part)

    return result * 12 + END_INVERTED[end] * 6 + START_INVERTED[start]
# ------------------------------------------------------------------------------
#
# FULL STRING ENCODING
#
# ------------------------------------------------------------------------------
# Side note:
# The following generator yields the byte lengths into which the input
# is chunked. The sequence of chunk lengths itself is an error detection code.
def _default_generator():
s = 2
while True:
n = yield (s % 3) + 1
s = (s * s + n + 1337) % 65537
def encode_string(s, delimiter=' ', safe=False, chunk_generator=None):
    """Encode an arbitrary byte string into words of the KADOKUSEI code.

    The first word encodes the length of the input. Each following word
    encodes one chunk of the input, packed into an integer with the first
    byte of the chunk in the lowest 8 bits. The chunk lengths come from
    ``chunk_generator``, which receives the integer value of each chunk via
    ``send()`` before producing the next length.

    Args:
        s: The data to encode: ``bytes``/``bytearray``, or a text string
            whose characters all have code points below 256.
        delimiter: String placed between the encoded words.
        safe: Passed on to ``encode_number`` for every word. The result
            must be decoded with the same setting.
        chunk_generator: Optional generator yielding positive chunk lengths.
            Defaults to a fresh ``_default_generator()``. The decoder must
            use an equivalent generator.

    Returns:
        The encoded words joined by ``delimiter``.

    Raises:
        ValueError: If ``s`` contains a character with a code point above
            255, or if the chunk generator yields a length below 1.
    """
    generator = chunk_generator or _default_generator()
    chunksize = next(generator)

    # Normalise the input to byte values so that Python 2 strings, Python 3
    # bytes and Python 3 text are all handled the same way. bytearray()
    # rejects values outside 0..255 instead of letting them spill into the
    # neighbouring byte of a chunk.
    if isinstance(s, (bytes, bytearray)):
        data = bytearray(s)
    else:
        data = bytearray(ord(ch) for ch in s)

    n = len(data)
    result = [encode_number(n, safe)]
    i = 0
    while i < n:
        if chunksize < 1:
            # A non-positive length would never consume any input.
            raise ValueError("Chunk generator must yield positive lengths")
        chunk = data[i:i + chunksize]  # the last chunk may be shorter
        buf = 0
        for j, byte in enumerate(chunk):
            buf |= byte << (8 * j)
        i += len(chunk)
        result.append(encode_number(buf, safe))
        chunksize = generator.send(buf)
    return delimiter.join(result)
def decode_string(s, delimiter=' ', safe=False, chunk_generator=None):
    """Decode a string represented in KADOKUSEI code.

    Args:
        s: The encoded text: a length word followed by one word per chunk,
            separated by ``delimiter``.
        delimiter: The separator that was used when encoding.
        safe: Passed on to ``decode_number`` for every word; must match the
            setting used when encoding.
        chunk_generator: Optional generator yielding the chunk lengths;
            must be equivalent to the one used when encoding. Defaults to a
            fresh ``_default_generator()``.

    Returns:
        The decoded data as a string with one character (code point 0-255)
        per original byte.

    Raises:
        ValueError: If a word contains an unrecognized syllable, if a word
            holds more data than its chunk length allows, or if fewer bytes
            were decoded than the length word announces.
    """
    generator = chunk_generator or _default_generator()
    chunksize = next(generator)

    words = iter(s.split(delimiter))
    size = decode_number(next(words), safe)

    result = []
    for word in words:
        buf = orig_buf = decode_number(word, safe)
        # Unpack the chunk, lowest byte first. A short final chunk simply
        # yields extra zero bytes, which are cut off by `size` below.
        for _ in range(chunksize):
            result.append(chr(buf & 0xFF))
            buf >>= 8
        if buf > 0:
            # More data in this word than the expected chunk length allows.
            raise ValueError("Code deemed invalid")
        chunksize = generator.send(orig_buf)

    if len(result) < size:
        # The length word promises more data than the chunks delivered.
        raise ValueError("Code deemed invalid")
    return ''.join(result[:size])
# ------------------------------------------------------------------------------
#
# SELF TEST WHEN INVOKED STANDALONE
#
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Self test; written so that it runs unchanged on Python 2 and Python 3.
    import os
    import random
    import sys

    print("Running quick test...")
    for i in range(20):
        r = random.randint(2 ** i, 2 ** (i + 2))
        # Round-trip the same number through both syllable tables.
        for safe in (False, True):
            code = encode_number(r, safe=safe)
            ok = decode_number(code, safe=safe) == r
            print("%s %s %s" % (r, code, ok and 'OK'))

    # No newline here, so that the final "done." ends up on the same line.
    sys.stdout.write("done. Running stress test... ")
    sys.stdout.flush()
    for i in range(100):
        for j in range(100):
            r = os.urandom(i)
            for safe in (False, True):
                decoded = decode_string(encode_string(r, safe=safe), safe=safe)
                # Compare byte values, independent of the str/bytes types
                # involved on the running Python version.
                assert bytearray(ord(c) for c in decoded) == bytearray(r)
    print("done.")
# -----------------------------------------------------------------------------
# Copyright (C) 2013 | Toni Mattis | Licensed under the MIT License
# -----------------------------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment