mattwildig/freq_analysis.rb

## freq_analysis.rb
# The relative letter frequencies in English. This is taken from Wikipedia
# (https://en.wikipedia.org/wiki/Letter_frequency) and is based on the Concise
# Oxford dictionary.
ENGLISH_FREQ = [8.167, 1.492, 2.782, 4.253, 12.702, 2.228, 2.015, 6.094, 6.966, 0.153, 0.772, 4.025, 2.406, 6.749, 7.507, 1.929, 0.095, 5.987, 6.327, 9.056, 2.758, 0.978, 2.361, 0.150, 1.974, 0.074]

# Reverse a shift of a single letter by another letter.
# 65 is the ASCII value of `A`, so subtracting 65 from the letter's ord
# value gives a number between 0-25, which we can then do arithmetic
# modulo 26 on.
def unshift(letter, shift_letter)
  letter, shift_letter = letter.ord - 65, shift_letter.ord - 65
  (((letter - shift_letter) % 26) + 65).chr
end

# The Dot product of two vectors (https://en.wikipedia.org/wiki/Dot_product)
# This is used in the calculation of Cosine similarity.
def dot_prod(a, b)
  a.zip(b).map {|a,b| a * b}.inject(:+)
end

# Vector length. The length of a vector. Used in cosine similarity
# calculation. This is just Pythagoras extended to more than two dimensions.
def vec_len(a)
  Math.sqrt(a.map {|a| a ** 2}.inject(:+))
end

# Cosine Similarity (https://en.wikipedia.org/wiki/Cosine_similarity).
# The cosine of the angle between two vectors. The larger this value is
# the closer the two vectors are.
def cos_sim(a, b)
  dot_prod(a,b) / (vec_len(a) * vec_len(b))
end

# Create frequency vector. Counts each letter in text and creates a 26
# element array containing the counts of each letter (element 0 is count of 'A',
# 1 is count of `B` etc.). To be compared with ENGLISH_FREQ.
def create_freq_vec(text)
  text.each_with_object(Array.new(26, 0)) {|c, ary| ary[c.ord - 65] += 1 }
end

# Ruby's `transpose` requires that all the nested arrays are the same size.
# This just pads the text with extra 'A's so that is true. This may affect the
# frequenct analysis a bit, but shouldn't do by too much.
def pad(chars, size)
  extra_chars = size - (chars.length % size)
  return chars if extra_chars == 0
  chars + Array.new(extra_chars, ?A)
end

# Given an array of chars, try each letter A-Z and unshift all chars
# by that letter. Find how close that set is to English using cosine
# similarity and return the shift char that gets the best result.
#
# This won't necessarily give the right answer in all cases, but seems to
# generally quite good.
def guess_key(chars)
  max_score = 0
  char = ??
  (?A..?Z).each do |s|
    shifted = chars.map {|c| unshift(c, s)}
    hist = create_freq_vec(shifted)
    score = cos_sim(hist, ENGLISH_FREQ)
    if score > max_score
      max_score, char = score, s
    end
  end

    char
end

# Decrypts cipher using key.
def decrypt(key, cipher)
  r = []
  pad(cipher.chars, key.length).each_slice(key.chars.length) do |s|
    r.concat key.chars.zip(s).map {|a, b| unshift(b, a)}
  end
  r.join
end


CIPHER_TEXT = "EFMZRNQMWOBEBUIXDDMTRDGAXGJUBKNEIVWATHLHJDZUOAOENVIBROCXCSLZUIFUDCSPHSGMDTQTQAUNVSWTVOAKXUPZVNGATAQJQLRVGSIYROSMWSJUBKGATAITHFNVIIZKEOSMWSJUBKUTHWVIYUQXSHPKBNVHCOBSLRRJJSAZFOLHJFMRVKRPDKBNVSOHDYKUZEFPXHPGAOABDBMBRNVYNCCJBNGIPFBOPUYTGZGRVKRHCWWTFIZLJFMEBUPTCOXVEEPBPHMZUEYHVWAZVCFHUGPOCPVGVOVEFOEMDTXXBDHVTRQYPRRXIZGOASVWTCNGAAYETUMJCRBZGOUSVNTFPBCGY"

KEY_LENGTH = 8

chars = pad(CIPHER_TEXT.chars, KEY_LENGTH)

key = chars.each_slice(KEY_LENGTH).to_a.transpose.map {|s| guess_key(s)}.join

puts "KEY: #{key}"
puts
puts decrypt(key, CIPHER_TEXT)
	# The relative letter frequencies in English. This is taken from Wikipedia
	# (https://en.wikipedia.org/wiki/Letter_frequency) and is based on the Concise
	# Oxford dictionary.
	ENGLISH_FREQ = [8.167, 1.492, 2.782, 4.253, 12.702, 2.228, 2.015, 6.094, 6.966, 0.153, 0.772, 4.025, 2.406, 6.749, 7.507, 1.929, 0.095, 5.987, 6.327, 9.056, 2.758, 0.978, 2.361, 0.150, 1.974, 0.074]

	# Reverse a shift of a single letter by another letter.
	# 65 is the ASCII value of `A`, so subtracting 65 from the letter's ord
	# value gives a number between 0-25, which we can then do arithmetic
	# modulo 26 on.
	def unshift(letter, shift_letter)
	letter, shift_letter = letter.ord - 65, shift_letter.ord - 65
	(((letter - shift_letter) % 26) + 65).chr
	end

	# The Dot product of two vectors (https://en.wikipedia.org/wiki/Dot_product)
	# This is used in the calculation of Cosine similarity.
	def dot_prod(a, b)
	a.zip(b).map {\|a,b\| a * b}.inject(:+)
	end

	# Vector length. The length of a vector. Used in cosine similarity
	# calculation. This is just Pythagoras extended to more than two dimensions.
	def vec_len(a)
	Math.sqrt(a.map {\|a\| a ** 2}.inject(:+))
	end

	# Cosine Similarity (https://en.wikipedia.org/wiki/Cosine_similarity).
	# The cosine of the angle between two vectors. The larger this value is
	# the closer the two vectors are.
	def cos_sim(a, b)
	dot_prod(a,b) / (vec_len(a) * vec_len(b))
	end

	# Create frequency vector. Counts each letter in text and creates a 26
	# element array containing the counts of each letter (element 0 is count of 'A',
	# 1 is count of `B` etc.). To be compared with ENGLISH_FREQ.
	def create_freq_vec(text)
	text.each_with_object(Array.new(26, 0)) {\|c, ary\| ary[c.ord - 65] += 1 }
	end

	# Ruby's `transpose` requires that all the nested arrays are the same size.
	# This just pads the text with extra 'A's so that is true. This may affect the
	# frequenct analysis a bit, but shouldn't do by too much.
	def pad(chars, size)
	extra_chars = size - (chars.length % size)
	return chars if extra_chars == 0
	chars + Array.new(extra_chars, ?A)
	end

	# Given an array of chars, try each letter A-Z and unshift all chars
	# by that letter. Find how close that set is to English using cosine
	# similarity and return the shift char that gets the best result.
	#
	# This won't necessarily give the right answer in all cases, but seems to
	# generally quite good.
	def guess_key(chars)
	max_score = 0
	char = ??
	(?A..?Z).each do \|s\|
	shifted = chars.map {\|c\| unshift(c, s)}
	hist = create_freq_vec(shifted)
	score = cos_sim(hist, ENGLISH_FREQ)
	if score > max_score
	max_score, char = score, s
	end
	end

	char
	end

	# Decrypts cipher using key.
	def decrypt(key, cipher)
	r = []
	pad(cipher.chars, key.length).each_slice(key.chars.length) do \|s\|
	r.concat key.chars.zip(s).map {\|a, b\| unshift(b, a)}
	end
	r.join
	end


	CIPHER_TEXT = "EFMZRNQMWOBEBUIXDDMTRDGAXGJUBKNEIVWATHLHJDZUOAOENVIBROCXCSLZUIFUDCSPHSGMDTQTQAUNVSWTVOAKXUPZVNGATAQJQLRVGSIYROSMWSJUBKGATAITHFNVIIZKEOSMWSJUBKUTHWVIYUQXSHPKBNVHCOBSLRRJJSAZFOLHJFMRVKRPDKBNVSOHDYKUZEFPXHPGAOABDBMBRNVYNCCJBNGIPFBOPUYTGZGRVKRHCWWTFIZLJFMEBUPTCOXVEEPBPHMZUEYHVWAZVCFHUGPOCPVGVOVEFOEMDTXXBDHVTRQYPRRXIZGOASVWTCNGAAYETUMJCRBZGOUSVNTFPBCGY"

	KEY_LENGTH = 8

	chars = pad(CIPHER_TEXT.chars, KEY_LENGTH)

	key = chars.each_slice(KEY_LENGTH).to_a.transpose.map {\|s\| guess_key(s)}.join

	puts "KEY: #{key}"
	puts
	puts decrypt(key, CIPHER_TEXT)