Created
September 6, 2019 16:36
-
-
Save thorwhalen/5c903b6b541f435a2d6a58924a9da869 to your computer and use it in GitHub Desktop.
Tools to make pronounceable strings from numbers or other obscure ids.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from itertools import chain, cycle, islice
# Symbol pools used by the converters in this module.
ascii_alphabet = 'abcdefghijklmnopqrstuvwxyz'
vowels = 'aeiou'
# Every lowercase letter that is not a vowel, in alphabetical order.
consonants = ''.join(letter for letter in ascii_alphabet if letter not in vowels)
# Lowercase letters followed by the ten digits.
alpha_numerics = ascii_alphabet + '0123456789'
# Pair of symbol sets: vowels first, consonants second.
vowels_and_consonants = (vowels, consonants)
def number_to_multi_base(n, b):
    """
    Convert a non-negative number to a multi-base (generalization of base projection).

    Digits are extracted from the least significant end, taking the bases from ``b`` in
    order and cycling through them as often as needed. The digit list is then reversed,
    so the result reads most-significant-digit first and the FIRST element of ``b``
    governs the LAST digit of the result.

    Args:
        n: The non-negative number to convert.
        b: The base to convert it to: a single number, or any iterable of bases
            (consumed lazily and then cycled).

    Returns:
        A list of ints representing the number in the desired base, most significant
        digit first. ``[0]`` when ``n`` is 0.

    Raises:
        ValueError: If ``n`` is negative, if a base is smaller than 1, or if a whole
            pass over the bases leaves ``n`` unchanged (``b`` is empty or contains
            nothing greater than 1). Without these checks the conversion loop would
            never terminate or would produce meaningless digits.

    # When b is just one number, it's the base (for e.g. b=2 means binary base)
    >>> number_to_multi_base(3, 2)
    [1, 1]
    >>> number_to_multi_base(4, 2)
    [1, 0, 0]
    >>> number_to_multi_base(5, 2)
    [1, 0, 1]

    # But the cool thing about number_to_multi_base is that you can have a more complex base (any iterable, really)
    >>> number_to_multi_base(11, [2, 3])
    [1, 2, 1]
    >>> number_to_multi_base(12, [2, 3])
    [1, 0, 0, 0]
    >>> number_to_multi_base(13, [2, 3])
    [1, 0, 0, 1]
    >>> number_to_multi_base(14, [2, 3])
    [1, 0, 1, 0]
    >>> number_to_multi_base(15, [2, 3])
    [1, 0, 1, 1]
    >>> number_to_multi_base(16, [2, 3])
    [1, 0, 2, 0]
    """
    if n < 0:
        # Floor division of a negative number settles on -1, never 0.
        raise ValueError(f"n must be non-negative, got {n!r}")
    if n == 0:
        return [0]
    if isinstance(b, (int, float)):
        b = [b]

    # A sentinel appended to the bases marks the end of every pass through them. It lets
    # us notice a pass that made no progress without materializing ``b`` up front.
    end_of_pass = object()
    n_at_pass_start = n
    digits = []
    for radix in cycle(chain(b, [end_of_pass])):
        if radix is end_of_pass:
            if n == n_at_pass_start:
                raise ValueError("b must contain at least one base greater than 1")
            n_at_pass_start = n
            continue
        if radix < 1:
            raise ValueError(f"bases must be at least 1, got {radix!r}")
        digits.append(int(n % radix))
        n //= radix
        if not n:
            break
    # Digits were collected least significant first.
    return digits[::-1]
def str_from_num_list(coord, symbols_for_base_idx=vowels_and_consonants, base_phase=0):
    """
    Spell out a list of digit values as a string, drawing each character from the
    symbol set that belongs to its position.

    Position ``i`` of ``coord`` uses symbol set number ``(base_phase + i) % n`` (with
    ``n`` the number of symbol sets), and the digit value at that position is the index
    of the character taken from that set.

    NOTE: symbols_for_base_idx sets should (in most cases) all be disjoint (but this is not validated!)

    Args:
        coord: An array of integers. Coordinates of a number in a given base system
        symbols_for_base_idx: Sets of symbols for each base
        base_phase: Which base (of symbols_for_base_idx) to start with (and then cycle)

    Returns:
        The string spelled by ``coord``, one character per coordinate.

    >>> str_from_num_list([1,2,1,2], ['ae', 'xyz'])
    'ezez'
    >>> str_from_num_list([1,2,1,0], ['ae', 'xyz'])
    'ezex'
    >>>
    >>> # [1,2,0,1] is [1,2,1,0], with the last two digits flipped, but you don't get ezxe in the following:
    >>> str_from_num_list([1,2,0,1], ['ae', 'xyz'])
    'ezay'
    """
    n_symbol_sets = len(symbols_for_base_idx)
    # Endless 0, 1, ..., n-1, 0, 1, ... sequence, entered base_phase steps in.
    set_indices = islice(cycle(range(n_symbol_sets)), base_phase, None)
    return ''.join(
        symbols_for_base_idx[set_idx][symbol_idx]
        for symbol_idx, set_idx in zip(coord, set_indices)
    )
# TODO: Look into coverage (which strings are reachable at all). Couldn't produce 'magic'
#   with ['ai', 'mgc'] or ['mgc', 'ai']; note that 'm' is the first symbol of 'mgc'.
def text_for_num(num, symbols_for_base_idx=vowels_and_consonants):
    """
    Map a non-negative integer to a string.

    ``num % len(symbols_for_base_idx)`` selects which symbol set the LAST character is
    drawn from. The quotient is then written in a mixed base whose radices are the sizes
    of the symbol sets, and each digit is spelled with the symbol set of its position,
    so consecutive characters come from consecutive symbol sets.

    The map is one-to-one (distinct numbers give distinct strings) if the sets of
    symbols in symbols_for_base_idx are non-overlapping. It does not reach every
    alternating string: the leading digit of a multi-digit number is never zero, so a
    result longer than one character never starts with the first symbol of a set.

    Args:
        num: A non-negative integer to map to text
        symbols_for_base_idx: The sets of symbols to use: A list of strings, each string representing a
            collection of symbols to use in each base.

    Returns:
        A string representing the input number.

    Raises:
        ValueError: If ``num`` is negative, if ``symbols_for_base_idx`` is empty, or if
            no symbol set has at least two symbols while ``num`` needs more than one
            digit (the digit extraction could not terminate in those cases).

    >>> # using the default symbols_for_base_idx (vowels and consonants):
    >>> text_for_num(1060)
    'caca'
    >>> text_for_num(14818)
    'sapu'
    >>> text_for_num(335517)
    'tecon'
    >>>
    >>> # using custom ones:
    >>> text_for_num(153, ['ai', 'gcm'])
    'magic'
    """
    if num < 0:
        raise ValueError(f"num must be non-negative, got {num!r}")
    base_cardinalities = [len(symbols) for symbols in symbols_for_base_idx]
    n_bases = len(base_cardinalities)
    if n_bases == 0:
        raise ValueError("symbols_for_base_idx must contain at least one symbol set")

    # The remainder picks the starting symbol set; the quotient is what gets spelled.
    num, base_phase = divmod(num, n_bases)
    if num and max(base_cardinalities) < 2:
        raise ValueError(
            "at least one symbol set needs two or more symbols to encode this number"
        )

    # Rotate the radices so that the least significant digit uses set ``base_phase``.
    base = list(islice(cycle(base_cardinalities), base_phase, n_bases + base_phase))
    coord = number_to_multi_base(num, base)
    # Spell least-significant-first (so positions line up with the rotated radices),
    # then flip back to most-significant-first.
    return str_from_num_list(coord[::-1], symbols_for_base_idx, base_phase)[::-1]
# Positive infinity; a max_word_length of ``inf`` means words are never split.
inf = float('inf')
def _split_long_word(word, max_word_length, sep):
    """Cut ``word`` into ``sep``-joined pieces of at most ``max_word_length`` characters.

    When the length is not a multiple of ``max_word_length`` the short piece comes
    first, so every following piece is full length.
    """
    remainder = len(word) % max_word_length
    pieces = [word[:remainder]] if remainder else []
    pieces.extend(
        word[start:start + max_word_length]
        for start in range(remainder, len(word), max_word_length)
    )
    return sep.join(pieces)


def text_to_pronounceable_text(text,
                               symbols_for_base_idx=vowels_and_consonants,
                               captured_alphabet=alpha_numerics,
                               case_sensitive=False,
                               max_word_length=30,
                               artificial_word_sep='_',
                               assert_no_word_sep_in_text=False
                               ):
    """
    Replace every run of ``captured_alphabet`` characters in ``text`` by a pronounceable
    word, leaving all other characters exactly where they are.

    Each run is read as a number in base ``len(captured_alphabet)``, with its first
    character as the least significant digit, and that number is spelled with
    ``text_for_num``.

    NOTE: Because the first character is the least significant digit, runs that differ
    only by trailing occurrences of ``captured_alphabet[0]`` (value 0) map to the same
    word, so the mapping is not invertible for such runs.

    Args:
        text: text you want to map
        symbols_for_base_idx: symbols you want to map TO (default is vowels and consonants)
        captured_alphabet: the symbols of the words you want to map FROM (essentially, in contrast to filler
            characters). A string of single characters; each one is taken literally.
        case_sensitive: If False (the default), the input text is lower cased before being processed
        max_word_length: The maximum length of a pronounceable word (use ``float('inf')`` for no limit)
        artificial_word_sep: The separator to separate pronounceable words when the word is too long
        assert_no_word_sep_in_text: Whether to assert that artificial_word_sep is not already in the input text
            (to avoid clashing and non-invertibility). Only checked when a word actually has to be split.

    Returns:
        A more pronounceable text, where pronounceable is defined by you, so not my fault if it's not.

    Raises:
        ValueError: If ``max_word_length`` is smaller than 1.
        AssertionError: If ``assert_no_word_sep_in_text`` is set, a word needs splitting,
            and ``artificial_word_sep`` already occurs in the text.

    >>> text_to_pronounceable_text('asd8098 098df')
    'izokagamuta osuhoju'
    >>> text_to_pronounceable_text('asd8098 098df', max_word_length=4, artificial_word_sep='_')
    'izo_kaga_muta osu_hoju'
    >>> # text without any captured characters comes back unchanged
    >>> text_to_pronounceable_text('?! ')
    '?! '
    >>> text_to_pronounceable_text('')
    ''
    """
    if max_word_length < 1:
        raise ValueError(f"max_word_length must be at least 1, got {max_word_length!r}")
    if not case_sensitive:
        text = text.lower()

    # Escape the alphabet so characters such as ']' '^' '-' or '\\' are matched literally
    # instead of changing the meaning of the character class.
    chunk_pattern = re.compile(f'[{re.escape(captured_alphabet)}]+')
    num_of_character = {c: i for i, c in enumerate(captured_alphabet)}  # the numerical mapping of alphabet
    base_n = len(captured_alphabet)

    def pronounceable_word(match):
        chunk = match.group()
        # The (base_n) number for the chunk, first character least significant.
        num = sum(num_of_character[c] * (base_n ** i) for i, c in enumerate(chunk))
        word = text_for_num(num, symbols_for_base_idx)
        if len(word) > max_word_length:
            # Raised explicitly (rather than with an assert statement) so the check
            # survives running Python with -O.
            if assert_no_word_sep_in_text and artificial_word_sep in text:
                raise AssertionError(
                    f"Your artificial_word_sep ({artificial_word_sep}) was in the text (so no bijective mapping)"
                )
            word = _split_long_word(word, max_word_length, artificial_word_sep)
        return word

    # Substituting in place keeps every separator where it was, including leading and
    # trailing ones, and never invents a word for an empty chunk.
    return chunk_pattern.sub(pronounceable_word, text)
class FunTests:
    """Informal demos meant to be looked at; nothing in here asserts anything."""

    @staticmethod
    def print_sequences_in_columns(start_num=3000, end_num=3060):
        """Print the text of every number in ``range(start_num, end_num)``, one per line.

        Texts of odd numbers are printed flush left; texts of even numbers are preceded
        by a tab, so the two parities show up as two columns.
        """
        for number in range(start_num, end_num):
            indent = "" if number % 2 else "\t"
            print(indent + "".join(str(symbol) for symbol in text_for_num(number)))
if __name__ == '__main__':
    # argh is only needed when the module is run as a script, so it is imported here
    # and a missing installation is reported with installation instructions.
    try:
        import argh
    except ImportError:
        raise ImportError("You don't have argh. You can install it by doing:\n"
                          " pip install argh\n"
                          "In your terminal/environment,")
    else:
        # Hand text_to_pronounceable_text to argh as the script's single command.
        argh.dispatch_command(text_to_pronounceable_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment