Skip to content

Instantly share code, notes, and snippets.

@ccd0
Last active September 22, 2020 17:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ccd0/bff899edcba8b6a876c930da65acc5a0 to your computer and use it in GitHub Desktop.
Save ccd0/bff899edcba8b6a876c930da65acc5a0 to your computer and use it in GitHub Desktop.
Convert a search of phonetic tripcodes to a search of base64 tripcodes.
#!/usr/bin/env python3
# This program reads patterns to search for in a phonetic tripcode.
# It converts them into a list of regular expressions for matching
# the corresponding base64 tripcodes, which can be fed to
# tripcode engines such as Merikens to search for tripcode passwords.
#
# Search patterns are read on standard input, one per line, and the
# regular expressions are printed to standard output, one per line.
#
# Patterns that cannot be matched result in no output.
#
# The symbols ^, $, and \b are supported and have the same meaning
# as in regular expressions:
# ^ matches the beginning of the tripcode
# $ matches the end of the tripcode
# \b matches a word break
#
# For search purposes any leading ! is not considered part of the tripcode.
#
# If given the option 'forward' on the command line, the program will
# instead read the first item on each line that is formatted like
# a base-64 tripcode, and print the tripcode in phonetic encoding.
# This is useful for finding out what the tripcodes chosen by
# a tripcode engine will be displayed as.
#
# If given the option 'filter' on the command line, the program will
# print only the input patterns that can be matched.
# -----------------------------------------------------------------------------
# Copyright (c) 2020 contributors
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
# AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
# -----------------------------------------------------------------------------
import re, itertools, base64, sys, unittest
CONSONANTS = 'bcdfghjklmnpqrstvwxz'
VOWELS = 'aeiouy'
BASE64_ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789./'
TRIPCODE_END_CHARS = '.26AEIMQUYcgkosw'
WORD_BOUNDARY_TOKENS = ['^', '$', '-', r'\b']
LOOKUP_MID = [c + v for c in CONSONANTS.replace('q', '') for v in VOWELS] + [
'cha', 'che', 'sha', 'the', 'sta', 'ste', 'tri', 'lee', 'bra', 'lia', 'gla', 'lea', 'dia', 'gai',
'mia', 'ree', 'phi', 'die', 'lei', 'sie', 'lai', 'dwi', 'sca', 'pri', 'tha', 'rey', 'roy', 'kay', 'que', 'cia',
'cie', 'bri', 'rya', 'sco', 'lio', 'sto', 'nia', 'tie', 'she', 'dua', 'ria', 'tho', 'tia', 'nya', 'spe', 'ley',
'vio', 'qua', 'loi', 'qui', 'bia', 'sey', 'chi', 'lay', 'nee', 'dra', 'via', 'teo', 'fre', 'kie', 'kia', 'saa',
'sea', 'sia', 'cly', 'rai', 'sue', 'mee', 'mya', 'nie', 'shi', 'wre', 'sho', 'rio', 'dee', 'vie', 'dwa', 'cla',
'tra', 'ble', 'kai', 'lie', 'lya', 'ney', 'sla', 'blo', 'nai', 'way', 'smi', 'kae', 'kee', 'phe', 'rae', 'jua',
'nea', 'mai', 'zia', 'rye', 'kei', 'lle', 'nei', 'mie', 'cho', 'tya', 'nay', 'nye', 'rea', 'pha', 'deo', 'cle',
'tzi', 'lou', 'dre', 'khi', 'llu', 'fae', 'mou', 'cre', 'loy', 'sai', 'mae', 'rie', 'tre', 'cae', 'nyo', 'leo',
'ceo', 'ray', 'maa', 'shu', 'veo', 'naa', 'lau', 'gra', 'sti', 'xia', 'dio', 'gro', 'thi', 'gio', 'see', 'vau'
]
LOOKUP_BEG = [s.replace('x', '') for s in LOOKUP_MID]
LOOKUP_END = [''] + list('bcdfghklmnrstxz')
LOOKUP = 2 * [LOOKUP_BEG, LOOKUP_MID, LOOKUP_MID, LOOKUP_END]
class TestConstants(unittest.TestCase):
def test_letters(self):
self.assertEqual(len(CONSONANTS + VOWELS), 26)
def test_base64(self):
self.assertEqual(len(BASE64_ALPHABET), 64)
def test_lookup(self):
self.assertEqual([len(choices) for choices in LOOKUP], [2**n for n in [8,8,8,4,8,8,8,4]])
# Break up search pattern into syllables and special tokens ^ $ - \b.
# Invalid characters are passed through as tokens; they will cause a match failure later in find_syllables.
def tokenize(pattern):
return re.findall(r'[\^$-]|\\b|[{0}]+[{1}]*|[{1}]+|.'.format(CONSONANTS, VOWELS), pattern, flags=re.DOTALL)
class TestTokenize(unittest.TestCase):
def test_tokenize(self):
self.assertEqual(tokenize(r'^agramaax-\btrevi*&la$'), ['^', 'a', 'gra', 'maa', 'x', '-', r'\b', 'tre', 'vi', '*', '&', 'la', '$'])
# Process special tokens ^ $ - \b. Determine:
# syllables: list of syllables to match with special tokens filtered out, including possibly empty final consonants
# alignments: list of possible starting positions for the syllables to match
# strict_beg: if first syllable to match must match the beginning of a syllable
# strict_end: if last syllable to match must match the end of a syllable
def find_alignment(tokens):
syllables = []
alignments = list(range(9))
strict_beg = len(tokens) > 0 and tokens[0] in WORD_BOUNDARY_TOKENS
strict_end = len(tokens) > 0 and tokens[-1] in WORD_BOUNDARY_TOKENS
for i, token in enumerate(tokens):
n = len(syllables)
# insert empty final consonant if we hit a word boundary after a syllable ending in a vowel
if token in WORD_BOUNDARY_TOKENS and i > 0 and tokens[i-1][-1] in VOWELS:
alignments = [a for a in alignments if a + n == 3 or a + n == 7]
syllables.append('')
n += 1
if token == '^':
alignments = [a for a in alignments if a + n == 0]
elif token == '$':
alignments = [a for a in alignments if a + n == 8]
elif token == '-':
alignments = [a for a in alignments if a + n == 4]
elif token == r'\b':
alignments = [a for a in alignments if a + n in [0, 4, 8]]
else:
if token[-1] in VOWELS:
alignments = [a for a in alignments if 0 <= a + n <= 2 or 4 <= a + n <= 6]
else:
alignments = [a for a in alignments if 0 <= a + n <= 7]
syllables.append(token)
return syllables, alignments, strict_beg, strict_end
class TestAlignment(unittest.TestCase):
def test_syl1(self):
self.assertEqual(find_alignment(tokenize('ba')), (['ba'], [0, 1, 2, 4, 5, 6], False, False))
def test_syl2(self):
self.assertEqual(find_alignment(tokenize('baba')), (2*['ba'], [0, 1, 4, 5], False, False))
def test_syl3(self):
self.assertEqual(find_alignment(tokenize('bababa')), (3*['ba'], [0, 4], False, False))
def test_syl4(self):
self.assertEqual(find_alignment(tokenize('babababa')), (4*['ba'], [], False, False))
def test_lone_vowel(self):
self.assertEqual(find_alignment(tokenize('a')), (['a'], [0, 1, 2, 4, 5, 6], False, False))
def test_lone_cons(self):
self.assertEqual(find_alignment(tokenize('b')), (['b'], [0, 1, 2, 3, 4, 5, 6, 7], False, False))
def test_final_cons(self):
self.assertEqual(find_alignment(tokenize('bab')), (['ba', 'b'], [0, 1, 2, 4, 5, 6], False, False))
def test_start(self):
self.assertEqual(find_alignment(tokenize('^ba')), (['ba'], [0], True, False))
def test_end_vowel(self):
self.assertEqual(find_alignment(tokenize('ba$')), (['ba', ''], [6], False, True))
def test_end_cons(self):
self.assertEqual(find_alignment(tokenize('bab$')), (['ba', 'b'], [6], False, True))
def test_whole(self):
self.assertEqual(find_alignment(tokenize('^bababa-bababa$')), (2*['ba', 'ba', 'ba', ''], [0], True, True))
def test_separator_pre_vowel(self):
self.assertEqual(find_alignment(tokenize('-a')), (['a'], [4], True, False))
def test_separator_pre_cons(self):
self.assertEqual(find_alignment(tokenize('-b')), (['b'], [4], True, False))
def test_separator_post_vowel(self):
self.assertEqual(find_alignment(tokenize('a-')), (['a', ''], [2], False, True))
def test_separator_post_cons(self):
self.assertEqual(find_alignment(tokenize('b-')), (['b'], [3], False, True))
def test_break_pre_vowel(self):
self.assertEqual(find_alignment(tokenize(r'\ba')), (['a'], [0, 4], True, False))
def test_break_pre_cons(self):
self.assertEqual(find_alignment(tokenize(r'\bb')), (['b'], [0, 4], True, False))
def test_break_post_vowel(self):
self.assertEqual(find_alignment(tokenize(r'a\b')), (['a', ''], [2, 6], False, True))
def test_break_post_cons(self):
self.assertEqual(find_alignment(tokenize(r'b\b')), (['b'], [3, 7], False, True))
def test_conflict(self):
self.assertEqual(find_alignment(tokenize(r'^baba-')), (['ba', 'ba', ''], [], True, True))
def test_syllable(haystack, needle, must_beg, must_end):
if must_beg:
if must_end:
return haystack == needle
else:
return haystack.startswith(needle)
else:
if must_end:
return haystack.endswith(needle)
else:
return needle in haystack
# For a given starting position of the syllables to match, find the list of possible syllables at each position in the tripcode.
# [None] indicates no restriction on the possible syllables at that position.
def find_syllables(syllables, align, strict_beg, strict_end):
syllable_choices = 8*[[None]]
for i, syl in enumerate(syllables):
j = i + align
must_beg = strict_beg or i > 0
must_end = strict_end or i < len(syllables) - 1
syllable_choices[j] = [n for n, syl2 in enumerate(LOOKUP[j]) if test_syllable(syl2, syl, must_beg, must_end)]
return syllable_choices
class TestFindSyllables(unittest.TestCase):
def test_rere(self):
self.assertEqual(find_syllables(['re', 're'], 1, False, False), [[None], [LOOKUP_MID.index(s) for s in ['re', 'fre', 'wre', 'dre', 'cre', 'tre']], [LOOKUP_MID.index(s) for s in ['re', 'ree', 'rey', 'rea']]] + 5*[[None]])
def test_re(self):
self.assertEqual(find_syllables(['re'], 1, False, False), [[None], [LOOKUP_MID.index(s) for s in ['re', 'ree', 'rey', 'fre', 'wre', 'rea', 'dre', 'cre', 'tre']]] + 6*[[None]])
def test_re_beg(self):
self.assertEqual(find_syllables(['re'], 1, True, False), [[None], [LOOKUP_MID.index(s) for s in ['re', 'ree', 'rey', 'rea']]] + 6*[[None]])
def test_re_end(self):
self.assertEqual(find_syllables(['re'], 1, False, True), [[None], [LOOKUP_MID.index(s) for s in ['re', 'fre', 'wre', 'dre', 'cre', 'tre']]] + 6*[[None]])
def test_re_exact(self):
self.assertEqual(find_syllables(['re'], 1, True, True), [[None], [LOOKUP_MID.index('re')]] + 6*[[None]])
def test_x(self):
self.assertEqual(find_syllables(['x'], 1, False, False), [[None], [LOOKUP_MID.index(s) for s in LOOKUP_MID if 'x' in s]] + 6*[[None]])
def test_x_wordbeg(self):
self.assertEqual(find_syllables(['x'], 0, False, False), [[]] + 7*[[None]])
def test_x_wordend(self):
self.assertEqual(find_syllables(['x'], 3, False, False), [[None], [None], [None], [LOOKUP_END.index('x')]] + 4*[[None]])
def test_e(self):
self.assertEqual(find_syllables(['e'], 1, False, False), [[None], [LOOKUP_MID.index(s) for s in LOOKUP_MID if 'e' in s]] + 6*[[None]])
def test_e_wordend(self):
self.assertEqual(find_syllables(['e'], 3, False, False), [[None], [None], [None], []] + 4*[[None]])
def test_invalid(self):
self.assertEqual(find_syllables(['*'], 1, False, False), [[None], []] + 6*[[None]])
def byte_to_bits(n):
return [(n >> (7-i)) & 1 for i in range(8)]
def bits_to_number(bits):
n = len(bits)
return sum(b << (n-1-i) for i, b in enumerate(bits))
class TestBits(unittest.TestCase):
def test_to_bits(self):
self.assertEqual(byte_to_bits(42), [0, 0, 1, 0, 1, 0, 1, 0])
def test_to_number(self):
self.assertEqual(bits_to_number([0, 0, 1, 0, 1, 0, 1, 0]), 42)
def test_roundtrip(self):
for n in range(256):
self.assertEqual(bits_to_number(byte_to_bits(n)), n)
# Expand a list of syllable numbers to a list of bits in the tripcode.
# None indicates no restrictions on the syllable / bit at that position.
def expand_bits(syllable_numbers):
# final consonants (syllable_numbers[3] and syllable_numbers[7]) are created by splitting a byte up; move them back together
numbers_original_order = syllable_numbers[0:4] + syllable_numbers[7:8] + syllable_numbers[4:7]
known_bits = []
for i, n in enumerate(numbers_original_order):
bits = 8*[None] if n is None else byte_to_bits(n)
if i == 3 or i == 4:
# final consonants are only 4 bits each
bits = bits[4:8]
known_bits += bits
# add 4 final bits to the 56 bits represented by the syllables to make a 10-digit base-64 string (60 bits)
known_bits += 4*[None]
return known_bits
class TestExpandBits(unittest.TestCase):
def test_expand(self):
self.assertEqual(expand_bits([None, 42, 42] + 5*[None]), 8*[None] + 2*[0, 0, 1, 0, 1, 0, 1, 0] + 36*[None])
def test_length(self):
self.assertEqual(len(expand_bits([None, 42, 42] + 5*[None])), 60)
# Convert list of known bits (None representing unknown) into a string of possible base-64 digits, or None if any digit is possible.
def possible_digits64(bit_slice):
if all(b is None for b in bit_slice):
return None
else:
bit_choices = [[0, 1] if b is None else [b] for b in bit_slice]
return ''.join(BASE64_ALPHABET[bits_to_number(bs)] for bs in itertools.product(*bit_choices))
class TestPossibleDigits64(unittest.TestCase):
def test_mixed(self):
self.assertEqual([BASE64_ALPHABET.index(c) for c in possible_digits64([None, 1, 0, 0, 0, None])], [16, 17, 48, 49])
def test_42(self):
self.assertEqual([BASE64_ALPHABET.index(c) for c in possible_digits64([0, 0, 1, 0, 1, 0, 1, 0])], [42])
def test_None(self):
self.assertEqual(possible_digits64(6*[None]), None)
# Convert string of possible base-64 digits into part of a regular expression.
def make_regexp_component(digits):
if digits is None:
return '.'
elif len(digits) == 1:
if digits[0] == '.':
return r'\.'
else:
return digits[0]
else:
return '[' + digits + ']'
# Convert list of strings of possible base-64 digits into a regular expression.
def make_regexp(dig64_list):
regexp = ''.join(make_regexp_component(ds) for ds in dig64_list)
unknown_beg = len(re.search(r'^\.*', regexp).group(0))
unknown_end = len(re.search(r'\.*$', regexp).group(0))
if unknown_end >= unknown_beg:
regexp = '^' + re.sub(r'\.*$', '', regexp)
else:
regexp = re.sub(r'^\.*', '', regexp) + '$'
return regexp
class TestMakeRegexp(unittest.TestCase):
def test_beg(self):
self.assertEqual(make_regexp([None, 'a', '.', 'CDE.'] + 6*[None]), r'^.a\.[CDE.]')
def test_end(self):
self.assertEqual(make_regexp(5*[None] + ['a', '.', 'CDE.', None, None]), r'a\.[CDE.]..$')
# Generator which, given a pattern to search for in a phonetic tripcode, yields regular expressions to search for the corresponding base64 tripcodes.
def generate_regexps(pattern):
pattern = pattern.lower()
tokens = tokenize(pattern)
syllables, alignments, strict_beg, strict_end = find_alignment(tokens)
for align in alignments:
syllable_choices = find_syllables(syllables, align, strict_beg, strict_end)
for syllable_numbers in itertools.product(*syllable_choices):
known_bits = expand_bits(syllable_numbers)
slices = [known_bits[6*i:6*i+6] for i in range(10)]
dig64_list = [possible_digits64(sl) for sl in slices]
if dig64_list[-1] is not None:
# tripcodes can only end in certain characters; no need to search impossible tripcodes
dig64_list[-1] = ''.join(c for c in dig64_list[-1] if c in TRIPCODE_END_CHARS)
regexp = make_regexp(dig64_list)
yield regexp
class TestGenerateRegexps(unittest.TestCase):
def test_bababa(self):
self.assertEqual(list(generate_regexps('BaBaBa')), ['^AAAA', '[AQgw]AAA[AEIM]$'])
def test_bababa_sep(self):
self.assertEqual(list(generate_regexps('BaBaBa-')), ['^AAAA[ABCD]'])
def test_bababa_break(self):
self.assertEqual(list(generate_regexps(r'BaBaBa\b')), ['^AAAA[ABCD]', '[AEIMQUYcgkosw048]AAAA[AEIM]$'])
def test_ba6(self):
self.assertEqual(list(generate_regexps('^BaBaBa-BaBaBa$')), ['^AAAAAAAAA[AEIM]'])
def test_hubaba(self):
self.assertEqual(list(generate_regexps('hubaba-')), ['^IgAA[ABCD]', '^8wAA[ABCD]'])
def test_babax(self):
self.assertEqual(list(generate_regexps('babax')), ['^AABm', '^AABn', '^AABo', '^AABp', '^AABq', '^AABr', '^AAD5', '^.[AQgw]AA[4567]', '[AQgw]AAZ[gkos]$', '[AQgw]AAZ[w26.]$', '[AQgw]AAa[AEIM]$', '[AQgw]AAa[QUYc]$', '[AQgw]AAa[gkos]$', '[AQgw]AAa[w26.]$', r'[AQgw]AA\.[QUYc]$', '[DHLPTXbfjnrvz37/][ghijklmnopqrstuv][AEIMQUYcgkosw048]AA[AEIM]$'])
# Test whether pattern can be matched.
def is_matchable(pattern):
for regexp in generate_regexps(pattern):
return True
return False
# Convert base-64 encoded strings to phonetic encoding.
def encode_phonetic(base64_string):
# crypt() uses a different base64 alphabet, so character replacement needed
base64_string = base64_string.replace('.', '+')
data = base64.b64decode(base64_string + '==')
output = ''
next_ender = ''
for i, byte in enumerate(data):
if i % 7 == 3 and i < len(data) - 1:
# Every 4 out of 7 bytes, if not last byte, split byte into two parts:
# 1. consonant appended to end of current word
output += LOOKUP_END[byte // 16] + ' '
# 2. consonant to be appended to next word
next_ender = LOOKUP_END[byte % 16]
elif i % 7 == 6 and i < len(data) - 1:
# Every 7 out of 7 bytes, if not last byte, consume next_ender and start new word
output += LOOKUP_MID[byte] + next_ender + ' '
next_ender = ''
else:
# Normal syllable lookup
output += LOOKUP_MID[byte]
# Append next_ender to end of last word if not already consumed
output += next_ender
# Remove initial 'x' from words so that words that start with a vowel can be generated
output = re.sub(r'\bx', '', output)
# Format in title case, words separated by a hyphen
output = output.title().replace(' ', '-')
return output
class TestEncode(unittest.TestCase):
def test_a10(self):
self.assertEqual(encode_phonetic(10*'A'), 'Bababa-Bababa')
def test_a2(self):
self.assertEqual(encode_phonetic(2*'A'), 'Ba')
def test_a3(self):
self.assertEqual(encode_phonetic(3*'A'), 'Baba')
def test_a4(self):
self.assertEqual(encode_phonetic(4*'A'), 'Bababa')
def test_a6(self):
self.assertEqual(encode_phonetic(6*'A'), 'Babababa')
def test_a7(self):
self.assertEqual(encode_phonetic(7*'A'), 'Bababa-Ba')
def test_a8(self):
self.assertEqual(encode_phonetic(8*'A'), 'Bababa-Baba')
def test_slash10(self):
self.assertEqual(encode_phonetic(10*'/'), 'Vauvauvauz-Vauvauvauz')
def test_vowel(self):
self.assertEqual(encode_phonetic('ZgAAAGYAAA'), 'Ababa-Ababa')
def test_different_syl(self):
self.assertEqual(encode_phonetic('AgMEHwUGBw'), 'Bibobub-Bycacez')
def main():
if len(sys.argv) >= 2 and sys.argv[1].lower() == 'forward':
for line in sys.stdin:
match = re.search('[{' + BASE64_ALPHABET + '}]{10}', line)
if match:
print(encode_phonetic(match.group(0)))
elif len(sys.argv) >= 2 and sys.argv[1].lower() == 'filter':
for line in sys.stdin:
line = line.replace('\n', '')
if is_matchable(line):
print(line)
else:
for line in sys.stdin:
line = line.replace('\n', '')
for regexp in generate_regexps(line):
print(regexp)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment