Skip to content

Instantly share code, notes, and snippets.

@sudoaza
Created July 21, 2023 08:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sudoaza/ecd68d778a7c8cb5dac4a97b98e21c0f to your computer and use it in GitHub Desktop.
Save sudoaza/ecd68d778a7c8cb5dac4a97b98e21c0f to your computer and use it in GitHub Desktop.
Substitution cipher analysis
import math
from nltk.util import ngrams
# Frequency analysis of a symbol-substitution ciphertext.
#
# Steps: count every character, derive per-symbol occurrence rates over the
# cipher symbols only, fold one already-identified token into "T", then score
# every repeated 2- to 5-symbol run by how much more often it occurs than the
# individual symbol rates would predict.
ciphertext = """['|']][]]}[[/]] [{}]{[]}['|'], ]]]]])([](_) {[9}])([['|'] ]]/[[[{}]{[]}['|'] ][['|'] ['|']{[]}]]([]{[[/]] ['|']])([ [{=][{[9}](_)]][}[]{ ])([](_)['|'] ['|'][{}][]{ [}/[}])([[[/]]['|'] [{}]{[]}]][}[[))[]{[}{][]{[[)) [{][]]}[{}][]{]][}[[/]]. [{[{}][]{[]{]][}[[/]], ]]]]])([](_)]][}[[/]] ]{]0]{]]][}00['|'] """

# Sorted so the printed tables (and the order of ties inside them) are the same
# on every run; iterating a bare set of strings depends on hash randomisation.
charset = sorted(set(ciphertext))
occurrences = {s: ciphertext.count(s) for s in charset}
print(sorted(occurrences.items(), key=lambda x: -x[1]))

# Space, comma, full stop and "0" are kept out of the symbol statistics.
used_symbols = [c for c in charset if c not in " ,.0"]
print(used_symbols)

# Total number of cipher symbols in the text (excluded characters not counted).
symbols_count = sum(occurrences[s] for s in used_symbols)
print(symbols_count)

occurrence_rate = {s: occurrences[s] / symbols_count for s in used_symbols}
print(sorted(occurrence_rate.items(), key=lambda x: -x[1]))

# Fold the token "['|']" into the single letter "T". "T" is not part of
# used_symbols, so every window that touches it is skipped below.
# NOTE(review): the rates above were measured before this substitution, so the
# counts for "[" and "]" still include the occurrences now folded into "T" --
# confirm that this is intended.
ciphertext = ciphertext.replace("['|']", "T")

symbol_set = frozenset(used_symbols)
ngram_likelihood = {}
for n in range(2, 6):
    # NOTE(review): the observed count is scaled by the number of *possible*
    # n-grams (alphabet size ** n), not by the number of windows in the text,
    # so scores are comparable within one n but not obviously across different
    # n -- confirm this normalisation is what is wanted.
    possible_ngrams = len(used_symbols) ** n
    seen = set()
    # Plain slicing produces the same sliding windows as
    # nltk.util.ngrams(list(ciphertext), n) without building a list and a
    # tuple for every window.
    for start in range(len(ciphertext) - n + 1):
        str_ngram = ciphertext[start:start + n]
        if str_ngram in seen:
            # Already scored (or rejected); the result cannot change.
            continue
        seen.add(str_ngram)
        if not symbol_set.issuperset(str_ngram):
            continue
        # str.count counts non-overlapping matches only.
        observed = ciphertext.count(str_ngram)
        if observed <= 1:
            continue
        expected_occurrence_rate = math.prod(occurrence_rate[s] for s in str_ngram)
        actual_occurrence_rate = observed / possible_ngrams
        ngram_likelihood[str_ngram] = actual_occurrence_rate / expected_occurrence_rate

print(sorted(ngram_likelihood.items(), key=lambda x: -x[1]))
print(ciphertext)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment