Skip to content

Instantly share code, notes, and snippets.

@hikari-no-yume
Last active August 18, 2021 00:46
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hikari-no-yume/7362f2b1d70e3d1250b07b09451f1c76 to your computer and use it in GitHub Desktop.
Save hikari-no-yume/7362f2b1d70e3d1250b07b09451f1c76 to your computer and use it in GitHub Desktop.
bruteforce mojibake decoder
# Codec names tried at every encode/decode stage of the brute-force search
# further down in this script.  The list is as of Python 3.9 (presumably the
# "standard encodings" shipped with that release -- confirm before relying
# on it with another interpreter version).
all_encodings = ['ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932', 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256', 'cp1257', 'cp1258', 'euc_jp', 'euc_jis_2004', 'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8', 'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u', 'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004', 'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be', 'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig']
# Text the search starts from (presumably the mojibake to be decoded).
start = 'åÊ'
def insert(d, k, set_elem):
    """Add *set_elem* to the set stored under key *k* in mapping *d*.

    When *k* is not yet a key of *d*, a new set containing only *set_elem*
    is stored under it.  The mapping is modified in place and nothing is
    returned.
    """
    # setdefault performs the "create the set on first use" step and hands
    # back the stored set in a single lookup.
    d.setdefault(k, set()).add(set_elem)
# Brute-force search over every codec combination for the chain
#     start  --encode(encoding1)--> stage1 (bytes)
#            --decode(encoding2)--> stage2 (str)
#            --encode(encoding3)--> stage3 (bytes)
#            --decode('utf-8')----> stage4 (str)
# keeping only the chains whose final stage4 is exactly one character.
# Every intermediate value is recorded with the set of codecs producing it.

# stage1 bytes -> names of the codecs that encode `start` to those bytes
stage1_encodings = dict()
# (stage1 bytes, stage2 text) -> codecs that decode stage1 into stage2
stage2_encodings = dict()
# (stage2 text, stage3 bytes) -> codecs that encode stage2 into stage3
stage3_encodings = dict()
# (stage3 bytes, stage4 text) pairs where stage4 is a single character
stage4_encodings = set()

# A codec that cannot handle the data raises a UnicodeError subclass, and a
# codec name unknown to this interpreter raises LookupError.  Only those are
# skipped; a bare `except:` would also swallow KeyboardInterrupt, making the
# long-running search impossible to stop with Ctrl-C.
_codec_errors = (UnicodeError, LookupError)
# stage2 texts whose encoding3 loop has already been executed
_expanded_stage2 = set()

for encoding1 in all_encodings:
    try:
        stage1 = start.encode(encoding1)
    except _codec_errors:
        continue
    stage1_is_new = stage1 not in stage1_encodings
    insert(stage1_encodings, stage1, encoding1)
    if not stage1_is_new:
        # Everything below depends only on the bytes, not on which codec
        # produced them; repeating it would re-insert identical entries.
        continue
    for encoding2 in all_encodings:
        try:
            stage2 = stage1.decode(encoding2)
        except _codec_errors:
            continue
        insert(stage2_encodings, (stage1, stage2), encoding2)
        if stage2 in _expanded_stage2:
            # Same reasoning: the stage3/stage4 results for this text are
            # already recorded, whichever bytes it was decoded from.
            continue
        _expanded_stage2.add(stage2)
        for encoding3 in all_encodings:
            try:
                stage3 = stage2.encode(encoding3)
            except _codec_errors:
                continue
            insert(stage3_encodings, (stage2, stage3), encoding3)
            try:
                stage4 = stage3.decode('utf-8')
            except UnicodeError:
                continue
            # Only chains that collapse to a single character are kept.
            if len(stage4) == 1:
                stage4_encodings.add((stage3, stage4))
# Print every single-character result followed by the chain that leads to
# it, walking backwards: result <- UTF-8 <- stage3 bytes <- encoding3
# <- stage2 text <- encoding2 <- stage1 bytes <- encoding1 <- start.
# Results and codec names are sorted so the report is identical from run to
# run; iterating the sets directly follows hash-randomised order.
# NOTE(review): every nested prefix is the same " |- " as found in the
# source; deeper per-level indentation may have been lost -- confirm.
for stage3, result in sorted(stage4_encodings):
    print(f"{result!r}")
    print("|- UTF-8")
    print(f" |- {stage3!r}")
    for (stage2, encoded), encoding3 in stage3_encodings.items():
        if encoded != stage3:
            continue
        print(f" |- {', '.join(sorted(encoding3))}")
        print(f" |- {stage2!r}")
        for (stage1, decoded), encoding2 in stage2_encodings.items():
            if decoded != stage2:
                continue
            print(f" |- {', '.join(sorted(encoding2))}")
            print(f" |- {stage1!r}")
            # Codecs that turn `start` into these stage1 bytes.
            encoding1 = stage1_encodings[stage1]
            print(f" |- {', '.join(sorted(encoding1))}")
            print(f" |- {start!r}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment