Last active
September 22, 2020 17:10
-
-
Save ccd0/bff899edcba8b6a876c930da65acc5a0 to your computer and use it in GitHub Desktop.
Convert a search of phonetic tripcodes to a search of base64 tripcodes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# This program reads patterns to search for in a phonetic tripcode. | |
# It converts them into a list of regular expressions for matching | |
# the corresponding base64 tripcodes, which can be fed to | |
# tripcode engines such as Merikens to search for tripcode passwords. | |
# | |
# Search patterns are read on standard input, one per line, and the | |
# regular expressions are printed to standard output, one per line. | |
# | |
# Patterns that cannot be matched result in no output. | |
# | |
# The symbols ^, $, and \b are supported and have the same meaning | |
# as in regular expressions: | |
# ^ matches the beginning of the tripcode | |
# $ matches the end of the tripcode | |
# \b matches a word break | |
# | |
# For search purposes any leading ! is not considered part of the tripcode. | |
# | |
# If given the option 'forward' on the command line, the program will | |
# instead read the first item on each line that is formatted like | |
# a base-64 tripcode, and print the tripcode in phonetic encoding. | |
# This is useful for finding out what the tripcodes chosen by | |
# a tripcode engine will be displayed as. | |
# | |
# If given the option 'filter' on the command line, the program will | |
# print only the input patterns that can be matched. | |
# ----------------------------------------------------------------------------- | |
# Copyright (c) 2020 contributors | |
# | |
# Permission to use, copy, modify, and/or distribute this software for any | |
# purpose with or without fee is hereby granted. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH | |
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY | |
# AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, | |
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM | |
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR | |
# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | |
# PERFORMANCE OF THIS SOFTWARE. | |
# ----------------------------------------------------------------------------- | |
import re, itertools, base64, sys, unittest | |
CONSONANTS = 'bcdfghjklmnpqrstvwxz' | |
VOWELS = 'aeiouy' | |
BASE64_ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789./' | |
TRIPCODE_END_CHARS = '.26AEIMQUYcgkosw' | |
WORD_BOUNDARY_TOKENS = ['^', '$', '-', r'\b'] | |
LOOKUP_MID = [c + v for c in CONSONANTS.replace('q', '') for v in VOWELS] + [ | |
'cha', 'che', 'sha', 'the', 'sta', 'ste', 'tri', 'lee', 'bra', 'lia', 'gla', 'lea', 'dia', 'gai', | |
'mia', 'ree', 'phi', 'die', 'lei', 'sie', 'lai', 'dwi', 'sca', 'pri', 'tha', 'rey', 'roy', 'kay', 'que', 'cia', | |
'cie', 'bri', 'rya', 'sco', 'lio', 'sto', 'nia', 'tie', 'she', 'dua', 'ria', 'tho', 'tia', 'nya', 'spe', 'ley', | |
'vio', 'qua', 'loi', 'qui', 'bia', 'sey', 'chi', 'lay', 'nee', 'dra', 'via', 'teo', 'fre', 'kie', 'kia', 'saa', | |
'sea', 'sia', 'cly', 'rai', 'sue', 'mee', 'mya', 'nie', 'shi', 'wre', 'sho', 'rio', 'dee', 'vie', 'dwa', 'cla', | |
'tra', 'ble', 'kai', 'lie', 'lya', 'ney', 'sla', 'blo', 'nai', 'way', 'smi', 'kae', 'kee', 'phe', 'rae', 'jua', | |
'nea', 'mai', 'zia', 'rye', 'kei', 'lle', 'nei', 'mie', 'cho', 'tya', 'nay', 'nye', 'rea', 'pha', 'deo', 'cle', | |
'tzi', 'lou', 'dre', 'khi', 'llu', 'fae', 'mou', 'cre', 'loy', 'sai', 'mae', 'rie', 'tre', 'cae', 'nyo', 'leo', | |
'ceo', 'ray', 'maa', 'shu', 'veo', 'naa', 'lau', 'gra', 'sti', 'xia', 'dio', 'gro', 'thi', 'gio', 'see', 'vau' | |
] | |
LOOKUP_BEG = [s.replace('x', '') for s in LOOKUP_MID] | |
LOOKUP_END = [''] + list('bcdfghklmnrstxz') | |
LOOKUP = 2 * [LOOKUP_BEG, LOOKUP_MID, LOOKUP_MID, LOOKUP_END] | |
class TestConstants(unittest.TestCase): | |
def test_letters(self): | |
self.assertEqual(len(CONSONANTS + VOWELS), 26) | |
def test_base64(self): | |
self.assertEqual(len(BASE64_ALPHABET), 64) | |
def test_lookup(self): | |
self.assertEqual([len(choices) for choices in LOOKUP], [2**n for n in [8,8,8,4,8,8,8,4]]) | |
# Break up search pattern into syllables and special tokens ^ $ - \b. | |
# Invalid characters are passed through as tokens; they will cause a match failure later in find_syllables. | |
def tokenize(pattern): | |
return re.findall(r'[\^$-]|\\b|[{0}]+[{1}]*|[{1}]+|.'.format(CONSONANTS, VOWELS), pattern, flags=re.DOTALL) | |
class TestTokenize(unittest.TestCase): | |
def test_tokenize(self): | |
self.assertEqual(tokenize(r'^agramaax-\btrevi*&la$'), ['^', 'a', 'gra', 'maa', 'x', '-', r'\b', 'tre', 'vi', '*', '&', 'la', '$']) | |
# Process special tokens ^ $ - \b. Determine: | |
# syllables: list of syllables to match with special tokens filtered out, including possibly empty final consonants | |
# alignments: list of possible starting positions for the syllables to match | |
# strict_beg: if first syllable to match must match the beginning of a syllable | |
# strict_end: if last syllable to match must match the end of a syllable | |
def find_alignment(tokens): | |
syllables = [] | |
alignments = list(range(9)) | |
strict_beg = len(tokens) > 0 and tokens[0] in WORD_BOUNDARY_TOKENS | |
strict_end = len(tokens) > 0 and tokens[-1] in WORD_BOUNDARY_TOKENS | |
for i, token in enumerate(tokens): | |
n = len(syllables) | |
# insert empty final consonant if we hit a word boundary after a syllable ending in a vowel | |
if token in WORD_BOUNDARY_TOKENS and i > 0 and tokens[i-1][-1] in VOWELS: | |
alignments = [a for a in alignments if a + n == 3 or a + n == 7] | |
syllables.append('') | |
n += 1 | |
if token == '^': | |
alignments = [a for a in alignments if a + n == 0] | |
elif token == '$': | |
alignments = [a for a in alignments if a + n == 8] | |
elif token == '-': | |
alignments = [a for a in alignments if a + n == 4] | |
elif token == r'\b': | |
alignments = [a for a in alignments if a + n in [0, 4, 8]] | |
else: | |
if token[-1] in VOWELS: | |
alignments = [a for a in alignments if 0 <= a + n <= 2 or 4 <= a + n <= 6] | |
else: | |
alignments = [a for a in alignments if 0 <= a + n <= 7] | |
syllables.append(token) | |
return syllables, alignments, strict_beg, strict_end | |
class TestAlignment(unittest.TestCase): | |
def test_syl1(self): | |
self.assertEqual(find_alignment(tokenize('ba')), (['ba'], [0, 1, 2, 4, 5, 6], False, False)) | |
def test_syl2(self): | |
self.assertEqual(find_alignment(tokenize('baba')), (2*['ba'], [0, 1, 4, 5], False, False)) | |
def test_syl3(self): | |
self.assertEqual(find_alignment(tokenize('bababa')), (3*['ba'], [0, 4], False, False)) | |
def test_syl4(self): | |
self.assertEqual(find_alignment(tokenize('babababa')), (4*['ba'], [], False, False)) | |
def test_lone_vowel(self): | |
self.assertEqual(find_alignment(tokenize('a')), (['a'], [0, 1, 2, 4, 5, 6], False, False)) | |
def test_lone_cons(self): | |
self.assertEqual(find_alignment(tokenize('b')), (['b'], [0, 1, 2, 3, 4, 5, 6, 7], False, False)) | |
def test_final_cons(self): | |
self.assertEqual(find_alignment(tokenize('bab')), (['ba', 'b'], [0, 1, 2, 4, 5, 6], False, False)) | |
def test_start(self): | |
self.assertEqual(find_alignment(tokenize('^ba')), (['ba'], [0], True, False)) | |
def test_end_vowel(self): | |
self.assertEqual(find_alignment(tokenize('ba$')), (['ba', ''], [6], False, True)) | |
def test_end_cons(self): | |
self.assertEqual(find_alignment(tokenize('bab$')), (['ba', 'b'], [6], False, True)) | |
def test_whole(self): | |
self.assertEqual(find_alignment(tokenize('^bababa-bababa$')), (2*['ba', 'ba', 'ba', ''], [0], True, True)) | |
def test_separator_pre_vowel(self): | |
self.assertEqual(find_alignment(tokenize('-a')), (['a'], [4], True, False)) | |
def test_separator_pre_cons(self): | |
self.assertEqual(find_alignment(tokenize('-b')), (['b'], [4], True, False)) | |
def test_separator_post_vowel(self): | |
self.assertEqual(find_alignment(tokenize('a-')), (['a', ''], [2], False, True)) | |
def test_separator_post_cons(self): | |
self.assertEqual(find_alignment(tokenize('b-')), (['b'], [3], False, True)) | |
def test_break_pre_vowel(self): | |
self.assertEqual(find_alignment(tokenize(r'\ba')), (['a'], [0, 4], True, False)) | |
def test_break_pre_cons(self): | |
self.assertEqual(find_alignment(tokenize(r'\bb')), (['b'], [0, 4], True, False)) | |
def test_break_post_vowel(self): | |
self.assertEqual(find_alignment(tokenize(r'a\b')), (['a', ''], [2, 6], False, True)) | |
def test_break_post_cons(self): | |
self.assertEqual(find_alignment(tokenize(r'b\b')), (['b'], [3, 7], False, True)) | |
def test_conflict(self): | |
self.assertEqual(find_alignment(tokenize(r'^baba-')), (['ba', 'ba', ''], [], True, True)) | |
def test_syllable(haystack, needle, must_beg, must_end): | |
if must_beg: | |
if must_end: | |
return haystack == needle | |
else: | |
return haystack.startswith(needle) | |
else: | |
if must_end: | |
return haystack.endswith(needle) | |
else: | |
return needle in haystack | |
# For a given starting position of the syllables to match, find the list of possible syllables at each position in the tripcode. | |
# [None] indicates no restriction on the possible syllables at that position. | |
def find_syllables(syllables, align, strict_beg, strict_end): | |
syllable_choices = 8*[[None]] | |
for i, syl in enumerate(syllables): | |
j = i + align | |
must_beg = strict_beg or i > 0 | |
must_end = strict_end or i < len(syllables) - 1 | |
syllable_choices[j] = [n for n, syl2 in enumerate(LOOKUP[j]) if test_syllable(syl2, syl, must_beg, must_end)] | |
return syllable_choices | |
class TestFindSyllables(unittest.TestCase): | |
def test_rere(self): | |
self.assertEqual(find_syllables(['re', 're'], 1, False, False), [[None], [LOOKUP_MID.index(s) for s in ['re', 'fre', 'wre', 'dre', 'cre', 'tre']], [LOOKUP_MID.index(s) for s in ['re', 'ree', 'rey', 'rea']]] + 5*[[None]]) | |
def test_re(self): | |
self.assertEqual(find_syllables(['re'], 1, False, False), [[None], [LOOKUP_MID.index(s) for s in ['re', 'ree', 'rey', 'fre', 'wre', 'rea', 'dre', 'cre', 'tre']]] + 6*[[None]]) | |
def test_re_beg(self): | |
self.assertEqual(find_syllables(['re'], 1, True, False), [[None], [LOOKUP_MID.index(s) for s in ['re', 'ree', 'rey', 'rea']]] + 6*[[None]]) | |
def test_re_end(self): | |
self.assertEqual(find_syllables(['re'], 1, False, True), [[None], [LOOKUP_MID.index(s) for s in ['re', 'fre', 'wre', 'dre', 'cre', 'tre']]] + 6*[[None]]) | |
def test_re_exact(self): | |
self.assertEqual(find_syllables(['re'], 1, True, True), [[None], [LOOKUP_MID.index('re')]] + 6*[[None]]) | |
def test_x(self): | |
self.assertEqual(find_syllables(['x'], 1, False, False), [[None], [LOOKUP_MID.index(s) for s in LOOKUP_MID if 'x' in s]] + 6*[[None]]) | |
def test_x_wordbeg(self): | |
self.assertEqual(find_syllables(['x'], 0, False, False), [[]] + 7*[[None]]) | |
def test_x_wordend(self): | |
self.assertEqual(find_syllables(['x'], 3, False, False), [[None], [None], [None], [LOOKUP_END.index('x')]] + 4*[[None]]) | |
def test_e(self): | |
self.assertEqual(find_syllables(['e'], 1, False, False), [[None], [LOOKUP_MID.index(s) for s in LOOKUP_MID if 'e' in s]] + 6*[[None]]) | |
def test_e_wordend(self): | |
self.assertEqual(find_syllables(['e'], 3, False, False), [[None], [None], [None], []] + 4*[[None]]) | |
def test_invalid(self): | |
self.assertEqual(find_syllables(['*'], 1, False, False), [[None], []] + 6*[[None]]) | |
def byte_to_bits(n): | |
return [(n >> (7-i)) & 1 for i in range(8)] | |
def bits_to_number(bits): | |
n = len(bits) | |
return sum(b << (n-1-i) for i, b in enumerate(bits)) | |
class TestBits(unittest.TestCase): | |
def test_to_bits(self): | |
self.assertEqual(byte_to_bits(42), [0, 0, 1, 0, 1, 0, 1, 0]) | |
def test_to_number(self): | |
self.assertEqual(bits_to_number([0, 0, 1, 0, 1, 0, 1, 0]), 42) | |
def test_roundtrip(self): | |
for n in range(256): | |
self.assertEqual(bits_to_number(byte_to_bits(n)), n) | |
# Expand a list of syllable numbers to a list of bits in the tripcode. | |
# None indicates no restrictions on the syllable / bit at that position. | |
def expand_bits(syllable_numbers): | |
# final consonants (syllable_numbers[3] and syllable_numbers[7]) are created by splitting a byte up; move them back together | |
numbers_original_order = syllable_numbers[0:4] + syllable_numbers[7:8] + syllable_numbers[4:7] | |
known_bits = [] | |
for i, n in enumerate(numbers_original_order): | |
bits = 8*[None] if n is None else byte_to_bits(n) | |
if i == 3 or i == 4: | |
# final consonants are only 4 bits each | |
bits = bits[4:8] | |
known_bits += bits | |
# add 4 final bits to the 56 bits represented by the syllables to make a 10-digit base-64 string (60 bits) | |
known_bits += 4*[None] | |
return known_bits | |
class TestExpandBits(unittest.TestCase): | |
def test_expand(self): | |
self.assertEqual(expand_bits([None, 42, 42] + 5*[None]), 8*[None] + 2*[0, 0, 1, 0, 1, 0, 1, 0] + 36*[None]) | |
def test_length(self): | |
self.assertEqual(len(expand_bits([None, 42, 42] + 5*[None])), 60) | |
# Convert list of known bits (None representing unknown) into a string of possible base-64 digits, or None if any digit is possible. | |
def possible_digits64(bit_slice): | |
if all(b is None for b in bit_slice): | |
return None | |
else: | |
bit_choices = [[0, 1] if b is None else [b] for b in bit_slice] | |
return ''.join(BASE64_ALPHABET[bits_to_number(bs)] for bs in itertools.product(*bit_choices)) | |
class TestPossibleDigits64(unittest.TestCase): | |
def test_mixed(self): | |
self.assertEqual([BASE64_ALPHABET.index(c) for c in possible_digits64([None, 1, 0, 0, 0, None])], [16, 17, 48, 49]) | |
def test_42(self): | |
self.assertEqual([BASE64_ALPHABET.index(c) for c in possible_digits64([0, 0, 1, 0, 1, 0, 1, 0])], [42]) | |
def test_None(self): | |
self.assertEqual(possible_digits64(6*[None]), None) | |
# Convert string of possible base-64 digits into part of a regular expression. | |
def make_regexp_component(digits): | |
if digits is None: | |
return '.' | |
elif len(digits) == 1: | |
if digits[0] == '.': | |
return r'\.' | |
else: | |
return digits[0] | |
else: | |
return '[' + digits + ']' | |
# Convert list of strings of possible base-64 digits into a regular expression. | |
def make_regexp(dig64_list): | |
regexp = ''.join(make_regexp_component(ds) for ds in dig64_list) | |
unknown_beg = len(re.search(r'^\.*', regexp).group(0)) | |
unknown_end = len(re.search(r'\.*$', regexp).group(0)) | |
if unknown_end >= unknown_beg: | |
regexp = '^' + re.sub(r'\.*$', '', regexp) | |
else: | |
regexp = re.sub(r'^\.*', '', regexp) + '$' | |
return regexp | |
class TestMakeRegexp(unittest.TestCase): | |
def test_beg(self): | |
self.assertEqual(make_regexp([None, 'a', '.', 'CDE.'] + 6*[None]), r'^.a\.[CDE.]') | |
def test_end(self): | |
self.assertEqual(make_regexp(5*[None] + ['a', '.', 'CDE.', None, None]), r'a\.[CDE.]..$') | |
# Generator which, given a pattern to search for in a phonetic tripcode, yields regular expressions to search for the corresponding base64 tripcodes. | |
def generate_regexps(pattern): | |
pattern = pattern.lower() | |
tokens = tokenize(pattern) | |
syllables, alignments, strict_beg, strict_end = find_alignment(tokens) | |
for align in alignments: | |
syllable_choices = find_syllables(syllables, align, strict_beg, strict_end) | |
for syllable_numbers in itertools.product(*syllable_choices): | |
known_bits = expand_bits(syllable_numbers) | |
slices = [known_bits[6*i:6*i+6] for i in range(10)] | |
dig64_list = [possible_digits64(sl) for sl in slices] | |
if dig64_list[-1] is not None: | |
# tripcodes can only end in certain characters; no need to search impossible tripcodes | |
dig64_list[-1] = ''.join(c for c in dig64_list[-1] if c in TRIPCODE_END_CHARS) | |
regexp = make_regexp(dig64_list) | |
yield regexp | |
class TestGenerateRegexps(unittest.TestCase): | |
def test_bababa(self): | |
self.assertEqual(list(generate_regexps('BaBaBa')), ['^AAAA', '[AQgw]AAA[AEIM]$']) | |
def test_bababa_sep(self): | |
self.assertEqual(list(generate_regexps('BaBaBa-')), ['^AAAA[ABCD]']) | |
def test_bababa_break(self): | |
self.assertEqual(list(generate_regexps(r'BaBaBa\b')), ['^AAAA[ABCD]', '[AEIMQUYcgkosw048]AAAA[AEIM]$']) | |
def test_ba6(self): | |
self.assertEqual(list(generate_regexps('^BaBaBa-BaBaBa$')), ['^AAAAAAAAA[AEIM]']) | |
def test_hubaba(self): | |
self.assertEqual(list(generate_regexps('hubaba-')), ['^IgAA[ABCD]', '^8wAA[ABCD]']) | |
def test_babax(self): | |
self.assertEqual(list(generate_regexps('babax')), ['^AABm', '^AABn', '^AABo', '^AABp', '^AABq', '^AABr', '^AAD5', '^.[AQgw]AA[4567]', '[AQgw]AAZ[gkos]$', '[AQgw]AAZ[w26.]$', '[AQgw]AAa[AEIM]$', '[AQgw]AAa[QUYc]$', '[AQgw]AAa[gkos]$', '[AQgw]AAa[w26.]$', r'[AQgw]AA\.[QUYc]$', '[DHLPTXbfjnrvz37/][ghijklmnopqrstuv][AEIMQUYcgkosw048]AA[AEIM]$']) | |
# Test whether pattern can be matched. | |
def is_matchable(pattern): | |
for regexp in generate_regexps(pattern): | |
return True | |
return False | |
# Convert base-64 encoded strings to phonetic encoding. | |
def encode_phonetic(base64_string): | |
# crypt() uses a different base64 alphabet, so character replacement needed | |
base64_string = base64_string.replace('.', '+') | |
data = base64.b64decode(base64_string + '==') | |
output = '' | |
next_ender = '' | |
for i, byte in enumerate(data): | |
if i % 7 == 3 and i < len(data) - 1: | |
# Every 4 out of 7 bytes, if not last byte, split byte into two parts: | |
# 1. consonant appended to end of current word | |
output += LOOKUP_END[byte // 16] + ' ' | |
# 2. consonant to be appended to next word | |
next_ender = LOOKUP_END[byte % 16] | |
elif i % 7 == 6 and i < len(data) - 1: | |
# Every 7 out of 7 bytes, if not last byte, consume next_ender and start new word | |
output += LOOKUP_MID[byte] + next_ender + ' ' | |
next_ender = '' | |
else: | |
# Normal syllable lookup | |
output += LOOKUP_MID[byte] | |
# Append next_ender to end of last word if not already consumed | |
output += next_ender | |
# Remove initial 'x' from words so that words that start with a vowel can be generated | |
output = re.sub(r'\bx', '', output) | |
# Format in title case, words separated by a hyphen | |
output = output.title().replace(' ', '-') | |
return output | |
class TestEncode(unittest.TestCase): | |
def test_a10(self): | |
self.assertEqual(encode_phonetic(10*'A'), 'Bababa-Bababa') | |
def test_a2(self): | |
self.assertEqual(encode_phonetic(2*'A'), 'Ba') | |
def test_a3(self): | |
self.assertEqual(encode_phonetic(3*'A'), 'Baba') | |
def test_a4(self): | |
self.assertEqual(encode_phonetic(4*'A'), 'Bababa') | |
def test_a6(self): | |
self.assertEqual(encode_phonetic(6*'A'), 'Babababa') | |
def test_a7(self): | |
self.assertEqual(encode_phonetic(7*'A'), 'Bababa-Ba') | |
def test_a8(self): | |
self.assertEqual(encode_phonetic(8*'A'), 'Bababa-Baba') | |
def test_slash10(self): | |
self.assertEqual(encode_phonetic(10*'/'), 'Vauvauvauz-Vauvauvauz') | |
def test_vowel(self): | |
self.assertEqual(encode_phonetic('ZgAAAGYAAA'), 'Ababa-Ababa') | |
def test_different_syl(self): | |
self.assertEqual(encode_phonetic('AgMEHwUGBw'), 'Bibobub-Bycacez') | |
def main(): | |
if len(sys.argv) >= 2 and sys.argv[1].lower() == 'forward': | |
for line in sys.stdin: | |
match = re.search('[{' + BASE64_ALPHABET + '}]{10}', line) | |
if match: | |
print(encode_phonetic(match.group(0))) | |
elif len(sys.argv) >= 2 and sys.argv[1].lower() == 'filter': | |
for line in sys.stdin: | |
line = line.replace('\n', '') | |
if is_matchable(line): | |
print(line) | |
else: | |
for line in sys.stdin: | |
line = line.replace('\n', '') | |
for regexp in generate_regexps(line): | |
print(regexp) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment