Skip to content

Instantly share code, notes, and snippets.

@newtonmwai
Forked from hrickards/crack.py
Created April 2, 2019 16:47
Show Gist options
  • Save newtonmwai/3f7a6c3d620babcf2f7e605d7cc868d8 to your computer and use it in GitHub Desktop.
Save newtonmwai/3f7a6c3d620babcf2f7e605d7cc868d8 to your computer and use it in GitHub Desktop.
Files for cracking substitution ciphers.
#!/usr/bin/pypy
# Change the above #! if you want to use python (slower) rather than pypy
import random, re, sys, math
from segment import segment
# Usage: ./crack.py ciphertext
# where ciphertext is (you guessed it) a plaintext file containing the
# ciphertext
# Currently uses a hill-climbing algorithm, which performs surprisingly well.
# TODO In future, simulated annealing is supposed to work a lot better
# Code based upon (basically a fork of)
# http://practicalcryptography.com/media/cryptanalysis/files/ngram_score_1.py
ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
ASCII_OFFSET = ord('A')
MAX_ITERATIONS = 1000
# Read ciphertext using filename from cli argument
ciphertext = open(sys.argv[1]).read()
# Bigrams list, trigram, etc
class NGrams(object):
# Load and parse ngrams list
# File format should be
# aard 5
# etc
def __init__(self, filename):
self.ngrams = {}
for line in open(filename):
char, count = line.split(" ")
self.ngrams[char] = int(count)
# The n in ngram
self.L = len(char)
# Total number of occurences
self.N = sum(self.ngrams.itervalues())
# Turn count into a probability, then log it. Done to avoid rounding
# errors with small floating point numbers.
for char in self.ngrams.keys():
self.ngrams[char] = math.log10(float(self.ngrams[char])/self.N)
# Don't want prob to be -infty
self.floor = math.log10(0.01/self.N)
# Score a given text by comparing it's ngrams
def score_text(self, text):
score = 0
for i in range(len(text)-self.L+1):
# For each segment of the same length as the ngrams
# Add it's probability if it's in the list of ngrams
# Otherwise add the lowest prob
if text[i:i+self.L] in self.ngrams:
score += self.ngrams[text[i:i+self.L]]
else: score += self.floor
return score
# Given a ciphertext and possible key, decipher the text to return a possible
# plaintext
def decipher(text, key):
# Find an inverse to the key. e.g. A->B, C->D goes to B->A, D->C
inverse = [ALPHABET[key.index(i)%26] for i in ALPHABET]
plaintext = ''
for char in text:
# Only want to decode letters
# ord(char.upper())-ASCII_OFFSET just gives us 0..25 value of letter
if char.isalpha(): plaintext += inverse[ord(char.upper())-ASCII_OFFSET]
else: plaintext += char
return plaintext
# For the moment, just use quadgrams to score as that seems to work really well
# TODO Would adding in lower-n grams, or just using quintgrams give better
# performance (at a guess, quintgrams would, but be much slower)?
quadgrams = NGrams("quadgrams.txt")
def score_key(ciphertext, key):
# Decipher and remove anything that's not a letter
plaintext = decipher(ciphertext, key)
return quadgrams.score_text(re.sub('[^A-Z]', '', plaintext.upper()))
# Ready a plaintext for output
def format_plaintext(plaintext):
# If there are no spaces in the plaintext, segment it into words using
# j2kun's library
# Not 100% accurate at word segmentation, but a lot better than no
# segmentation at all
if not (" " in plaintext):
words = []
for section in plaintext.upper().split("AND"):
words.append(" ".join(segment(section)))
plaintext = " and ".join(words)
return plaintext
# Run the whole decryption process multiple times, as it's a non-deterministic
# process
best_key = None
best_score = -99e99
while 1:
# Start with an initial key and score it
key = list(ALPHABET)
random.shuffle(key)
score = score_key(ciphertext, key)
# Only change the key a 1000 times with no score improvement before stopping
count = 0
while count < 1000:
# Create a new key by randomly swapping two positions
a = random.randint(0,25)
b = random.randint(0,25)
new_key = key[:]
new_key[a], new_key[b] = new_key[b], new_key[a]
# Score the new key
new_score = score_key(ciphertext, new_key)
# If the new key was better, replace the old one with it
if new_score > score:
score, key = new_score, new_key[:]
count = 0 # Restart the counter for detecting no score improvement
count += 1
# Only use this key if it was better than the last algo iteration
if score > best_score:
best_score = score
best_key = key
print "====================================================="
print format_plaintext(decipher(ciphertext, best_key))
print "====================================================="
Download from http://norvig.com/ngrams/count_1w.txt
Download from http://practicalcryptography.com/media/cryptanalysis/files/english_quadgrams.txt.zip
# From https://github.com/j2kun/substitution
#!/usr/bin/python
import functools, math
class OneGramDist(dict):
def __init__(self, filename):
self.gramCount = 0
for line in open(filename):
(word, count) = line[:-1].split('\t')
self[word] = int(count)
self.gramCount += self[word]
def __call__(self, key):
if key in self:
return float(self[key]) / self.gramCount
else:
return 1.0 / (self.gramCount * 10**(len(key)-2))
singleWordProb = OneGramDist('one-grams.txt')
def wordSeqFitness(words):
return sum(math.log10(singleWordProb(w)) for w in words)
def memoize(f):
cache = {}
def memoizedFunction(*args):
if args not in cache:
cache[args] = f(*args)
return cache[args]
memoizedFunction.cache = cache
return memoizedFunction
@memoize
def segment(word):
if not word: return []
word = word.lower() # change to lower case
allSegmentations = [[first] + segment(rest) for (first,rest) in splitPairs(word)]
return max(allSegmentations, key = wordSeqFitness)
def splitPairs(word, maxLen=20):
return [(word[:i+1], word[i+1:]) for i in range(max(len(word), maxLen))]
@memoize
def segmentWithProb(word):
segmented = segment(word)
return (wordSeqFitness(segmented), segmented)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment