axiomsofchoice/cipher.py

## cipher.py
#!/usr/bin/evn python
"""
Cryptanalysis of "Cipher" puzzle, Stage 5, Director GCHQ's Christmas Puzzle 2015
================================================================================

http://www.gchq.gov.uk/press_and_media/news_and_features/Pages/Director%27s-Christmas-puzzle-2015.aspx
Puzzle Crown Copyright.

SPOILER ALERT - If you're attempting the puzzle and you don't want to find out
any hints do not read ahead!

Below is an annoted account of how to cryptanalyse this cipher. Major clues all come from reddit threads (URLs cited) as my own efforts came to nothing.

-----------

The MIT License (MIT)
Copyright (c) 2015-2016 Dan Hagon.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

import re
import math
import itertools

cipher =   "21141252464586788990912124244364386588508520541651662"\
           "6828040042142252474595796718838050062084185386586696"
print cipher

##################
# Initial ideas. #
##################

# Gives 105.
print "Length of cipher", len(cipher)
print

def dofreqcounts(mylist):
    # Count frequencies and print in order.
    freq_dist = dict()
    for i in range(len(mylist)):
        if not mylist[i] in freq_dist:
            freq_dist[mylist[i]] = 1
        else:
            freq_dist[mylist[i]] += 1
    #print freq_dist
    my_sorted = sorted(freq_dist, key=freq_dist.__getitem__)
    my_sorted.reverse()
    for m in my_sorted:
        print "%2d:" % int(m), freq_dist[m]
    return freq_dist

dofreqcounts(cipher)
print

"""
Gives:
 8: 16
 6: 14
 4: 14
 2: 14
 5: 13
 0: 10
 1: 10
 9: 6
 7: 4
 3: 4

Chance of getting an even number is about 64.7%

---

Interesting that the prime factors of 105 are:
>>> (3)*(5)*(7)
105
possibly suggesting plaintext lengths of:
15*7, 21*5, 35*3

3 digits per plaintext character would be consistent with using an ASCII decimal encoding.

We have to assume the plaintext alphabet is {A-Z}* only since this much is hinted at in the problem description.

The Hill cipher is not a possibility with sizes 3*3, 5*5 or 7*7 as the prime factors are not repeated and we don't have enough plaintext to use standard techniques based on linear algebra.

The fact we don't have an even number of digits suggests the possibility of a straddled checkerboard cipher.

---

# Positions of the double digits.
print {m.group(1): m.start() for m in re.finditer(r'((\d)\2)', cipher)}
# {'11': 1, '00': 86, '22': 65, '44': 27, '99': 17, '88': 8_, '66': 101}
# Doesn't give anything useful.

---

my_str = ""
for c in cipher:
     my_str += "{0:b}".format(ord(c)-ord('0'))

print my_str
print len(my_str)
print math.sqrt(len(my_str))

ones = [i for i in my_str if '1' == i]

print "Number of 1's %d" % len(ones)

# There are 140 ones and 139 zores in the binary sequence (without leading zeros), perhaps suggesting a maximally equiprobable sequence?

---

# Get ngrams

stride_min = 2
stride_max = 7
ngrams = [list(itertools.chain([[cipher[ofs:][i:i+s] for i in
            range(_, len(cipher[ofs:]), s)] \
            for ofs in range(_, s)]))[0] \
            for s in range(stride_min,stride_max+1)]
print ngrams

counts = dict()
for n in ngrams:
    for b in n:
        if b not in counts:
            counts[b] = 1
        else:
            counts[b] += 1
print counts

# Find any repeated n-grams
print {k:v for k,v in counts.iteritems() if v > 1}
# {'24': 3, '38': 3, '696': 2, '21': 3, '3865': 2, '88': 2, '65': 2, '6': 2, '86': 2, '85': 2}

# Given these bigraph and 4-graph appear at the following positions:
likely_candidates = ['24', '38', '21', '3865', '88', '85']
print {k: [m.start() for m in re.finditer(k, cipher)] for k in
                                                            likely_candidates}
{'24': [7, 24, 26, 68], '38': [32, 82, 96], '21': [0, 22, 62], '3865': [32, 96], '88': [15, 36, 80], '85': [37, 40, 94]}

Note that the second occurance of 3865 is at a position that is 3 times the first position.

"""

# We get closer to the true solution by noting the following:

def printIndividualPositions(new_cipher):
    print " ", new_cipher
    # Print graticules to see how the prime factors compare to periodicities.
    print " ", "".join(['|','-','-'] * 5 * 7)
    print " ", "".join(['|','-','-','-','-'] * 3 * 7)
    print " ", "".join(['|','-','-','-','-','-','-'] * 3 * 5)

    for j in range(10):
        my_map = dict()
        for i in range(10):
            if i == j:
                my_map["%d" % i] = "*"
            else:
                my_map["%d" % i] = "_"

        #print
        print j, "".join([my_map[c] for c in new_cipher])

printIndividualPositions(cipher)
print

"""
Gives:

  211412524645867889909121242443643865885085205416516626828040042142252474595796718838050062084185386586696
  |--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--
  |----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----|----
  |------|------|------|------|------|------|------|------|------|------|------|------|------|------|------
0 ___________________*___________________*___*_____________*_**_______________________*_**__*______________
1 _**_*________________*_*______________________*__*_____________*_______________*_____________*___________
2 *____*_*______________*_*_*_______________*_________*__*______*__**_*____________________*_______________
3 _____________________________*__*_________________________________________________*_____________*________
4 ___*____*_*______________*_**__*_____________*____________*__*__*____*_*____________________*____________
5 ______*____*_______________________*__*__*__*___*__________________*____*_*__________*_________*___*_____
6 _________*___*________________*___*____________*__**_*_______________________*__________*_________*__**_*
7 ______________*_______________________________________________________*____*__*__________________________
8 ____________*__**________________*__**__*_____________*_*_______________________**_*_______*__*__*__*____
9 _________________**_*____________________________________________________*__*__________________________*_


We clearly see patterns of increasing numbers but sadly this was as far as I could get on my own.

Notice two important parts of this pattern (my ideas about this were firmed up after I was shown the answer, when they became obvious; this sort of thing probably takes practice):
1) The angle is roughly the same as you would get if the cipher was "112233445566..." suggesting the number two or something close to it is somehow involved here.
2) There are aproximately three regions and within each region we see some peridicities (as per 2) above) that diverge more and more as we move to the right.
"""

#####################
# Actual solution.  #
#####################

# Chop up into three digit tuples. Note we have to append an "initialization vector" of sorts to make the cryptanalsys go through.
# First part suggested by: https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxvpdk4
# Second part of above suggested by: https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxwfudo
trigraphs = ['000']+[cipher[i*3:(i+1)*3] for i in range(len(cipher)/3)]

print trigraphs
print

def computePairwiseDiffs(trigraphA, trigraphB):
    # Suggested by: https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxvqb74
    ret_tupl = []
    for i in range(3):
        tA = int(trigraphA[i])
        tB = int(trigraphB[i])
        # Compute pairwise difference of trigraphs mod 10 in each digit.
        if tA >= tB:
            ret_tupl.append("%d"%((tA - tB % 10)))
        else:
            ret_tupl.append("%d"%(((tA - tB + 10)% 10)))
    return "".join(ret_tupl)

# Compute differences between consecutive trigraphs; Convert to base-3.
# The reasoning for converting to base-3 is by inspection of the differences which only contain digits with values 0,1,2.
# Suggested by: https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxvstsq
tri_diffs = [int(computePairwiseDiffs(trigraphs[i+1], trigraphs[i]), 3) \
                for i in range(len(trigraphs)-1)]
print tri_diffs

"""
Gives:
[22, 19, 14, 16, 26, 8, 15, 26, 16, 19, 18, 26, 6, 18, 24, 22, 9, 12, 20, 26, 2, 9, 12, 26, 16, 19, 8, 15, 26, 5, 8, 10, 19, 18, 12]
"""

print "Length of trigram diffs: ", len(tri_diffs)
print
# Gives 34, will is a fairly short cipher as things go.


tri_freq_dist = dofreqcounts(tri_diffs)

"""
Gives:

26: 6
19: 4
18: 3
16: 3
12: 3
 8: 3
22: 2
15: 2
 9: 2
24: 1
20: 1
14: 1
10: 1
 6: 1
 5: 1
 2: 1

Notice how we only have numbers in the range 1-26 here.

"""

# The following suggests that we now have a monoalphabetic substituion cipher (in fact it gives the substitution table): https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxwe92e


# The next thing to do is to test that we really do have a monoalphabet by computing the Index of Coincidence.
# For more details see "Elementary Cryptanalysis" by Sinkov or "Cryptography and Data Security" by Denning.

def compute_ic(dict_dist, N):
    # Compute I.C.
    index_of_coincidence = 0
    for k,v in dict_dist.iteritems():
        index_of_coincidence += (float(v)*float(v - 1)) / (float(N*(N - 1)))

    return index_of_coincidence

print
print "Index of Coincidence:", compute_ic(tri_freq_dist, len(tri_diffs))
print

# Gives 0.0605042016807 which strongly suggests an English monoalphabet.


# Based on: http://norvig.com/mayzner.html
source_alphabet_freq_dist = {
    "E": 445.2,
    "T": 330.5,
    "A": 286.5,
    "O": 272.3,
    "I": 269.7,
    "N": 257.8,
    "S": 232.1,
    "R": 223.8,
    "H": 180.1,
    "L": 145.0,
    "D": 136.0,
    "C": 119.2,
    "U": 97.3,
    "M": 89.5,
    "F": 85.6,
    "P": 76.1,
    "G": 66.6,
    "W": 59.7,
    "Y": 59.3,
    "B": 52.9,
    "V": 37.5,
    "K": 19.3,
    "X": 8.4,
    "J": 5.7,
    "Q": 4.3,
    "Z": 3.2,
}
tot = 0.0
for k,v in source_alphabet_freq_dist.iteritems():
    tot += v
#print tot

# Draw graphs to see how the distributions compare.

def drawalphagraph(alpha_dist, total, scale_factor=300.0, usealphas=False):
    # Plot the alphabet frequency distrubtion.
    for i in range(1,26):
        if usealphas:
            print chr(ord('A')+i-1),\
                '*'*int(scale_factor * (alpha_dist[chr(ord('A')+i-1)]/total))
        else:
            if i in alpha_dist:
                print chr(ord('A')+i-1),\
                    '*'*int(scale_factor * (alpha_dist[i]/total))
            else:
                print chr(ord('A')+i-1)

drawalphagraph(source_alphabet_freq_dist, tot, usealphas=True)
print

drawalphagraph(tri_freq_dist, float(len(tri_diffs)), usealphas=False)
print

"""
Gives

A ************************
B ****
C **********
D ***********
E *************************************
F *******
G *****
H ***************
I **********************
J
K *
L ************
M *******
N *********************
O **********************
P ******
Q
R ******************
S *******************
T ***************************
U ********
V ***
W *****
X
Y ****

# Note the letter designators are arbitary here, based solely on their ordinals.

A
B ********
C
D
E ********
F ********
G
H **************************
I *****************
J ********
K
L **************************
M
N ********
O *****************
P **************************
Q
R **************************
S ***********************************
T ********
U
V ********
W
X ********
Y

Since the positions of peaks don't match well I think we can be pretty sure this is no a simple shift cipher.

"""


"""

Let's guess just based on frequencies (as above we use the Norvig dataset):

Notice that the sequence 26, 16, 19 appears twice in the cipher as does the sequence 8, 15, 26. Note also that 16, 26 also appears so this pair has high affinity and we already know that are the two most frequent digrams. The bigram 19, 18 also appears twice as does 9, 12. There are no repeated 1-grams.

Sinkov suggests we first look to find the vowels since four of the more frequent letters are vowels ('A','E','I','O') and there is typically one per syllable. We know the expected average word length is about 5 letters so there should be about 7 words and say around 10 vowels. At the present point in time we have nothing to say whether or not the cipher has spaces removed. However, let's look at the relative positions of the highest frequency letter:

"""

remap = {    # Frequency
    26: '-', # 6
    19: '_', # 4
    18: '_', # 3
    16: '_', # 3
    12: '_', # 3
     8: '_', # 3
    15: '_', # 2
     9: '_', # 2
    24: '_', # 1
    22: '_', # 1
    20: '_', # 1
    14: '_', # 1
    10: '_', # 1
     6: '_', # 1
     5: '_', # 1
     2: '_', # 1
}

print
print "Substituted cipher:"
print "".join([remap[c] for c in tri_diffs])

"""

Gives:
____-__-___-_______-___-____-______

This is a strong indication that 26 is in fact the space character (there are exactly 7 words, as we predicted). Perhaps this is also suggested if we assumed letters 0-25 are the normal characters of the alphabet? In fact this idea should have been considered earlier since the letters are represented by three digit ternary numbers and 3^3 = 27.

Look again at the encodings:

[22, 19, 14, 16, -, 8, 15, -, 16, 19, 18, -, 6, 18, 24, 22, 9, 12, 20, -, 2, 9, 12, -, 16, 19, 8, 15, -, 5, 8, 10, 19, 18, 12]

Having the word divisions greatly reduces the effort required to break a cipher. In particular we can immediately attack the shortest words first.

The most common bigrams that are complete words are 'HE', 'IN', 'AN', 'ON', 'AT', 'OR', 'OF', 'IS', 'IT', 'TO', 'AS', 'ME'. In any case we know that one of 8, 15 is a vowel. But we already noticed that 8, 15 appears again in the cipher and now we know it appears at the end of the sixth word.

The most common trigrams that are complete words are 'THE', 'AND', 'FOR' 'HER', 'HIS'. Since the third word contains more frequent letters than the fifth word we guess first that 16 is 'T', 19 is 'H' and 18 is 'E'. Note also that 16, 19 is also in the sixth word.

"""

remap = {    # Frequency
    26: '-', # 6
    19: 'H', # 4
    18: 'E', # 3 Vowel
    16: 'T', # 3
    12: '_', # 3
     8: '_', # 3
    15: '_', # 2
     9: '_', # 2
    24: '_', # 1
    22: '_', # 1
    20: '_', # 1
    14: '_', # 1
    10: '_', # 1
     6: '_', # 1
     5: '_', # 1
     2: '_', # 1
}

print
print "Substituted cipher:"
print "".join([remap[c] for c in tri_diffs])

"""
This gets us to:
_H_T-__-THE-_E_____-___-TH__-___HE_
^^^^ ^^                 ^^^^

[22, H, 14, T, -, 8, 15, -, T, H, E, -, 6, E, 24, 22, 9, 12, 20, -, 2, 9, 12, -, T, H, 8, 15, -, 5, 8, 10, H, E, 12]

Next we concentrate on words 1, 2 and 6. Recall the ending of word 6 is the same as the end of word 6. This limits the possibilities for word 2 to 'IN', 'AN', 'OR', 'IS'. For now it is safe to assume that no other encoding correspons to 'T' (in addition to 16 we found) so we discard 'IT' and 'AT' as possibilities. Given that this word comes before 'THE' we further reduce the reasonable possibilities down to 'IN', 'OR', 'IS'. Now we consider the alternatives for the first word:

_H_T-IN-THE-
_H_T-OR-THE-
_H_T-IS-THE-

Probable words for word one are:
CHAT   Verb/Noun
CHIT   Verb/Noun
SHOT   Noun
SHUT   Verb/Adj/Noun
THAT   Pronoun
WHAT   Pronoun
WHET   Verb
WHIT   Noun

By the same reasoning for 'E' given above we can assume that we've already found 'T' and discount the 'THAT' option. Of these, 'OR' is starting to make the least sense. Certainly 14 is a vowel. These now are the most probable:

SHOT-IN-THE-
WHAT-IS-THE-

Since the forth word is not going to be 'DARK' I'm more inclined to the last option. Lets try this and see what we get:

"""

remap = {    # Frequency
    26: '-', # 6
    19: 'H', # 4 Vowel
    18: 'E', # 3
    16: 'T', # 3
    12: '_', # 3
     8: 'I', # 3 Vowel
    15: 'S', # 2
     9: '_', # 2
    24: '_', # 1
    22: 'W', # 1
    20: '_', # 1
    14: 'A', # 1 Vowel
    10: '_', # 1
     6: '_', # 1
     5: '_', # 1
     2: '_', # 1
}

print
print "Substituted cipher:"
print "".join([remap[c] for c in tri_diffs])

"""
We now have:

WHAT-IS-THE-_E_W___-___-THIS-_I_HE_
                             ^^^^^^

[W, H, A, T, -, I, S, -, T, H, E, -, 6, E, 24, W, 9, 12, 20, -, 2, 9, 12, -, T, H, I, S, -, 5, I, 10, H, E, 12]

I next egrep my dictionary with "^.iphe.$" and this results in a unique word "CIPHER". I'm pretty happy about this :) Let's now try these new letters in our mapping:
"""

remap = {    # Frequency
    26: '-', # 6
    19: 'H', # 4 Vowel
    18: 'E', # 3
    16: 'T', # 3
    12: 'R', # 3
     8: 'I', # 3 Vowel
    15: 'S', # 2
     9: '_', # 2
    24: '_', # 1
    22: 'W', # 1
    20: '_', # 1
    14: 'A', # 1 Vowel
    10: 'P', # 1
     6: '_', # 1
     5: 'C', # 1
     2: '_', # 1
}

print
print "Substituted cipher:"
print "".join([remap[c] for c in tri_diffs])

"""
We now have:
WHAT-IS-THE-_E_W_R_-__R-THIS-CIPHER
            ^^^^^^^

[W, H, A, T, -, I, S, -, T, H, E, -, 6, E, 24, W, 9, R, 20, -, 2, 9, R, -, T, H, I, S, -, C, I, P, H, E, R]

Again I egrep my dictionary with "^.e.w.r.$" and this results in:
eelworm
keyword  <-- I like this one :)
leeward
legwork
network  <-- also possible
seaward
seaware

Let's now try 'KEYWORD' in our mapping:
"""


remap = {    # Frequency
    26: '-', # 6
    19: 'H', # 4 Vowel
    18: 'E', # 3
    16: 'T', # 3
    12: 'R', # 3
     8: 'I', # 3 Vowel
    15: 'S', # 2
     9: 'O', # 2 Vowel
    24: 'Y', # 1
    22: 'W', # 1
    20: 'D', # 1
    14: 'A', # 1 Vowel
    10: 'P', # 1
     6: 'K', # 1
     5: 'C', # 1
     2: '_', # 1
}

print
print "Substituted cipher:"
print "".join([remap[c] for c in tri_diffs])

"""
We're almost there:
WHAT-IS-THE-KEYWORD-_OR-THIS-CIPHER
                    ^^^

[W, H, A, T, -, I, S, -, T, H, E, -, K, E, Y, W, O, R, D, -, 2, 0, R, -, T, H, I, S, -, C, I, P, H, E, R]

The only missing letter is 2 which seems pretty clear now is 'F':
"""

remap = {    # Frequency
    26: '-', # 6
    19: 'H', # 4
    18: 'E', # 3 Vowel
    16: 'T', # 3
    12: 'R', # 3
     8: 'I', # 3 Vowel
    15: 'S', # 2
     9: 'O', # 2 Vowel
    24: 'Y', # 1
    22: 'W', # 1
    20: 'D', # 1
    14: 'A', # 1 Vowel
    10: 'P', # 1
     6: 'K', # 1
     5: 'C', # 1
     2: 'F', # 1
}

print
print "Substituted cipher:"
print "".join([remap[c] for c in tri_diffs])


"""
Finally we get the plaintext:

WHAT-IS-THE-KEYWORD-FOR-THIS-CIPHER

Clearly we're not done yet! Notice how we have 9 vowels, very close to our prediction.
"""


"""
From what we discovered above we know that the cipher alphabet is not just a shifted plain alphabet. Next we print out the cipher alphabet below the plain alphabet (in normal) sequence to see if we can spot any patterns.
"""

print

print remap
# Suggested by: https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxwmelf
enc_map = dict()
for k,v in remap.iteritems():
    if k != 26: # Ignore space character?
        enc_map[v] = chr(ord('A')+k) # Invert mapping.

print enc_map

# Plain alphabet.
out_str = ""
for c in [chr(ord('A')+i) for i in range(26)]:
    out_str += c
print out_str

# Cipher alphabet.
out_str = ""
for c in [chr(ord('A')+i) for i in range(26)]:
    if c in enc_map:
        out_str += enc_map[c]
    else:
        out_str += '_'
print out_str

"""
Gives:

ABCDEFGHIJKLMNOPQRSTUVWXYZ
O_FUSC_TI_G___JK_MPQ__W_Y_

In some types of substitution ciphers we'd expect the low frequency letters WXYZ at the end of the alphabet to line up as typically the keyword would appear at the beginning of the plain alphbet and the remaining letters at the end in normal sequence. That looks like what's going on here too.

If it's not already clear what the possible keyword is we can egrep our dictionary for "^o.fusc" and get:
obfuscate
obfuscable
obfuscated
obfuscates
obfuscator
obfuscating   <-- this one matches the remaining letters in the cipher alphabet
obfuscation
obfuscators
obfuscatory

We can now fill in the remaining letters of the cipher alphabet sequence:

ABCDEFGHIJKLMNOPQRSTUVWXYZ
OBFUSCATINGDEHJKLMPQRVWXYZ

The keyword is therefore:

OBFUSCATING

"""
	#!/usr/bin/evn python
	"""
	Cryptanalysis of "Cipher" puzzle, Stage 5, Director GCHQ's Christmas Puzzle 2015
	================================================================================

	http://www.gchq.gov.uk/press_and_media/news_and_features/Pages/Director%27s-Christmas-puzzle-2015.aspx
	Puzzle Crown Copyright.

	SPOILER ALERT - If you're attempting the puzzle and you don't want to find out
	any hints do not read ahead!

	Below is an annoted account of how to cryptanalyse this cipher. Major clues all come from reddit threads (URLs cited) as my own efforts came to nothing.

	-----------

	The MIT License (MIT)
	Copyright (c) 2015-2016 Dan Hagon.

	Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

	The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	"""

	import re
	import math
	import itertools

	cipher = "21141252464586788990912124244364386588508520541651662"\
	"6828040042142252474595796718838050062084185386586696"
	print cipher

	##################
	# Initial ideas. #
	##################

	# Gives 105.
	print "Length of cipher", len(cipher)
	print

	def dofreqcounts(mylist):
	# Count frequencies and print in order.
	freq_dist = dict()
	for i in range(len(mylist)):
	if not mylist[i] in freq_dist:
	freq_dist[mylist[i]] = 1
	else:
	freq_dist[mylist[i]] += 1
	#print freq_dist
	my_sorted = sorted(freq_dist, key=freq_dist.__getitem__)
	my_sorted.reverse()
	for m in my_sorted:
	print "%2d:" % int(m), freq_dist[m]
	return freq_dist

	dofreqcounts(cipher)
	print

	"""
	Gives:
	8: 16
	6: 14
	4: 14
	2: 14
	5: 13
	0: 10
	1: 10
	9: 6
	7: 4
	3: 4

	Chance of getting an even number is about 64.7%

	---

	Interesting that the prime factors of 105 are:
	>>> (3)(5)(7)
	105
	possibly suggesting plaintext lengths of:
	157, 215, 35*3

	3 digits per plaintext character would be consistent with using an ASCII decimal encoding.

	We have to assume the plaintext alphabet is {A-Z}* only since this much is hinted at in the problem description.

	The Hill cipher is not a possibility with sizes 33, 55 or 7*7 as the prime factors are not repeated and we don't have enough plaintext to use standard techniques based on linear algebra.

	The fact we don't have an even number of digits suggests the possibility of a straddled checkerboard cipher.

	---

	# Positions of the double digits.
	print {m.group(1): m.start() for m in re.finditer(r'((\d)\2)', cipher)}
	# {'11': 1, '00': 86, '22': 65, '44': 27, '99': 17, '88': 8_, '66': 101}
	# Doesn't give anything useful.

	---

	my_str = ""
	for c in cipher:
	my_str += "{0:b}".format(ord(c)-ord('0'))

	print my_str
	print len(my_str)
	print math.sqrt(len(my_str))

	ones = [i for i in my_str if '1' == i]

	print "Number of 1's %d" % len(ones)

	# There are 140 ones and 139 zores in the binary sequence (without leading zeros), perhaps suggesting a maximally equiprobable sequence?

	---

	# Get ngrams

	stride_min = 2
	stride_max = 7
	ngrams = [list(itertools.chain([[cipher[ofs:][i:i+s] for i in
	range(_, len(cipher[ofs:]), s)] \
	for ofs in range(_, s)]))[0] \
	for s in range(stride_min,stride_max+1)]
	print ngrams

	counts = dict()
	for n in ngrams:
	for b in n:
	if b not in counts:
	counts[b] = 1
	else:
	counts[b] += 1
	print counts

	# Find any repeated n-grams
	print {k:v for k,v in counts.iteritems() if v > 1}
	# {'24': 3, '38': 3, '696': 2, '21': 3, '3865': 2, '88': 2, '65': 2, '6': 2, '86': 2, '85': 2}

	# Given these bigraph and 4-graph appear at the following positions:
	likely_candidates = ['24', '38', '21', '3865', '88', '85']
	print {k: [m.start() for m in re.finditer(k, cipher)] for k in
	likely_candidates}
	{'24': [7, 24, 26, 68], '38': [32, 82, 96], '21': [0, 22, 62], '3865': [32, 96], '88': [15, 36, 80], '85': [37, 40, 94]}

	Note that the second occurance of 3865 is at a position that is 3 times the first position.

	"""

	# We get closer to the true solution by noting the following:

	def printIndividualPositions(new_cipher):
	print " ", new_cipher
	# Print graticules to see how the prime factors compare to periodicities.
	print " ", "".join(['\|','-','-'] * 5 * 7)
	print " ", "".join(['\|','-','-','-','-'] * 3 * 7)
	print " ", "".join(['\|','-','-','-','-','-','-'] * 3 * 5)

	for j in range(10):
	my_map = dict()
	for i in range(10):
	if i == j:
	my_map["%d" % i] = "*"
	else:
	my_map["%d" % i] = "_"

	#print
	print j, "".join([my_map[c] for c in new_cipher])

	printIndividualPositions(cipher)
	print

	"""
	Gives:

	211412524645867889909121242443643865885085205416516626828040042142252474595796718838050062084185386586696
	\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--\|--
	\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----\|----
	\|------\|------\|------\|------\|------\|------\|------\|------\|------\|------\|------\|------\|------\|------\|------
	0 _______________________________________________________*________________________*________________
	1 _*___________________________________________________________________________________*___________
	2 _______________________________________________________*_____________________*_______________
	3 _____________________________________________________________________________________________________
	4 _______________________*________________________________________________________*____________
	5 _______________________________________________________________________________________*_____
	6 ___________________________________________________________________________________________
	7 _____________________________________________________________________________________________________
	8 ________________________________*_______________________________________*__________________
	9 _________________*_________________________________________________________________________________*_


	We clearly see patterns of increasing numbers but sadly this was as far as I could get on my own.

	Notice two important parts of this pattern (my ideas about this were firmed up after I was shown the answer, when they became obvious; this sort of thing probably takes practice):
	1) The angle is roughly the same as you would get if the cipher was "112233445566..." suggesting the number two or something close to it is somehow involved here.
	2) There are aproximately three regions and within each region we see some peridicities (as per 2) above) that diverge more and more as we move to the right.
	"""

	#####################
	# Actual solution. #
	#####################

	# Chop up into three digit tuples. Note we have to append an "initialization vector" of sorts to make the cryptanalsys go through.
	# First part suggested by: https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxvpdk4
	# Second part of above suggested by: https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxwfudo
	trigraphs = ['000']+[cipher[i3:(i+1)3] for i in range(len(cipher)/3)]

	print trigraphs
	print

	def computePairwiseDiffs(trigraphA, trigraphB):
	# Suggested by: https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxvqb74
	ret_tupl = []
	for i in range(3):
	tA = int(trigraphA[i])
	tB = int(trigraphB[i])
	# Compute pairwise difference of trigraphs mod 10 in each digit.
	if tA >= tB:
	ret_tupl.append("%d"%((tA - tB % 10)))
	else:
	ret_tupl.append("%d"%(((tA - tB + 10)% 10)))
	return "".join(ret_tupl)

	# Compute differences between consecutive trigraphs; Convert to base-3.
	# The reasoning for converting to base-3 is by inspection of the differences which only contain digits with values 0,1,2.
	# Suggested by: https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxvstsq
	tri_diffs = [int(computePairwiseDiffs(trigraphs[i+1], trigraphs[i]), 3) \
	for i in range(len(trigraphs)-1)]
	print tri_diffs

	"""
	Gives:
	[22, 19, 14, 16, 26, 8, 15, 26, 16, 19, 18, 26, 6, 18, 24, 22, 9, 12, 20, 26, 2, 9, 12, 26, 16, 19, 8, 15, 26, 5, 8, 10, 19, 18, 12]
	"""

	print "Length of trigram diffs: ", len(tri_diffs)
	print
	# Gives 34, will is a fairly short cipher as things go.


	tri_freq_dist = dofreqcounts(tri_diffs)

	"""
	Gives:

	26: 6
	19: 4
	18: 3
	16: 3
	12: 3
	8: 3
	22: 2
	15: 2
	9: 2
	24: 1
	20: 1
	14: 1
	10: 1
	6: 1
	5: 1
	2: 1

	Notice how we only have numbers in the range 1-26 here.

	"""

	# The following suggests that we now have a monoalphabetic substituion cipher (in fact it gives the substitution table): https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxwe92e


	# The next thing to do is to test that we really do have a monoalphabet by computing the Index of Coincidence.
	# For more details see "Elementary Cryptanalysis" by Sinkov or "Cryptography and Data Security" by Denning.

	def compute_ic(dict_dist, N):
	# Compute I.C.
	index_of_coincidence = 0
	for k,v in dict_dist.iteritems():
	index_of_coincidence += (float(v)float(v - 1)) / (float(N(N - 1)))

	return index_of_coincidence

	print
	print "Index of Coincidence:", compute_ic(tri_freq_dist, len(tri_diffs))
	print

	# Gives 0.0605042016807 which strongly suggests an English monoalphabet.


	# Based on: http://norvig.com/mayzner.html
	source_alphabet_freq_dist = {
	"E": 445.2,
	"T": 330.5,
	"A": 286.5,
	"O": 272.3,
	"I": 269.7,
	"N": 257.8,
	"S": 232.1,
	"R": 223.8,
	"H": 180.1,
	"L": 145.0,
	"D": 136.0,
	"C": 119.2,
	"U": 97.3,
	"M": 89.5,
	"F": 85.6,
	"P": 76.1,
	"G": 66.6,
	"W": 59.7,
	"Y": 59.3,
	"B": 52.9,
	"V": 37.5,
	"K": 19.3,
	"X": 8.4,
	"J": 5.7,
	"Q": 4.3,
	"Z": 3.2,
	}
	tot = 0.0
	for k,v in source_alphabet_freq_dist.iteritems():
	tot += v
	#print tot

	# Draw graphs to see how the distributions compare.

	def drawalphagraph(alpha_dist, total, scale_factor=300.0, usealphas=False):
	# Plot the alphabet frequency distrubtion.
	for i in range(1,26):
	if usealphas:
	print chr(ord('A')+i-1),\
	''int(scale_factor * (alpha_dist[chr(ord('A')+i-1)]/total))
	else:
	if i in alpha_dist:
	print chr(ord('A')+i-1),\
	''int(scale_factor * (alpha_dist[i]/total))
	else:
	print chr(ord('A')+i-1)

	drawalphagraph(source_alphabet_freq_dist, tot, usealphas=True)
	print

	drawalphagraph(tri_freq_dist, float(len(tri_diffs)), usealphas=False)
	print

	"""
	Gives

	A ************************
	B ****
	C **********
	D ***********
	E *************************************
	F *******
	G *****
	H ***************
	I **********************
	J
	K *
	L ************
	M *******
	N *********************
	O **********************
	P ******
	Q
	R ******************
	S *******************
	T ***************************
	U ********
	V ***
	W *****
	X
	Y ****

	# Note the letter designators are arbitary here, based solely on their ordinals.

	A
	B ********
	C
	D
	E ********
	F ********
	G
	H **************************
	I *****************
	J ********
	K
	L **************************
	M
	N ********
	O *****************
	P **************************
	Q
	R **************************
	S ***********************************
	T ********
	U
	V ********
	W
	X ********
	Y

	Since the positions of peaks don't match well I think we can be pretty sure this is no a simple shift cipher.

	"""


	"""

	Let's guess just based on frequencies (as above we use the Norvig dataset):

	Notice that the sequence 26, 16, 19 appears twice in the cipher as does the sequence 8, 15, 26. Note also that 16, 26 also appears so this pair has high affinity and we already know that are the two most frequent digrams. The bigram 19, 18 also appears twice as does 9, 12. There are no repeated 1-grams.

	Sinkov suggests we first look to find the vowels since four of the more frequent letters are vowels ('A','E','I','O') and there is typically one per syllable. We know the expected average word length is about 5 letters so there should be about 7 words and say around 10 vowels. At the present point in time we have nothing to say whether or not the cipher has spaces removed. However, let's look at the relative positions of the highest frequency letter:

	"""

	remap = { # Frequency
	26: '-', # 6
	19: '_', # 4
	18: '_', # 3
	16: '_', # 3
	12: '_', # 3
	8: '_', # 3
	15: '_', # 2
	9: '_', # 2
	24: '_', # 1
	22: '_', # 1
	20: '_', # 1
	14: '_', # 1
	10: '_', # 1
	6: '_', # 1
	5: '_', # 1
	2: '_', # 1
	}

	print
	print "Substituted cipher:"
	print "".join([remap[c] for c in tri_diffs])

	"""

	Gives:
	____-__-___-_______-___-____-______

	This is a strong indication that 26 is in fact the space character (there are exactly 7 words, as we predicted). Perhaps this is also suggested if we assumed letters 0-25 are the normal characters of the alphabet? In fact this idea should have been considered earlier since the letters are represented by three digit ternary numbers and 3^3 = 27.

	Look again at the encodings:

	[22, 19, 14, 16, -, 8, 15, -, 16, 19, 18, -, 6, 18, 24, 22, 9, 12, 20, -, 2, 9, 12, -, 16, 19, 8, 15, -, 5, 8, 10, 19, 18, 12]

	Having the word divisions greatly reduces the effort required to break a cipher. In particular we can immediately attack the shortest words first.

	The most common bigrams that are complete words are 'HE', 'IN', 'AN', 'ON', 'AT', 'OR', 'OF', 'IS', 'IT', 'TO', 'AS', 'ME'. In any case we know that one of 8, 15 is a vowel. But we already noticed that 8, 15 appears again in the cipher and now we know it appears at the end of the sixth word.

	The most common trigrams that are complete words are 'THE', 'AND', 'FOR' 'HER', 'HIS'. Since the third word contains more frequent letters than the fifth word we guess first that 16 is 'T', 19 is 'H' and 18 is 'E'. Note also that 16, 19 is also in the sixth word.

	"""

	remap = { # Frequency
	26: '-', # 6
	19: 'H', # 4
	18: 'E', # 3 Vowel
	16: 'T', # 3
	12: '_', # 3
	8: '_', # 3
	15: '_', # 2
	9: '_', # 2
	24: '_', # 1
	22: '_', # 1
	20: '_', # 1
	14: '_', # 1
	10: '_', # 1
	6: '_', # 1
	5: '_', # 1
	2: '_', # 1
	}

	print
	print "Substituted cipher:"
	print "".join([remap[c] for c in tri_diffs])

	"""
	This gets us to:
	_H_T-__-THE-_E_____-___-TH__-___HE_
	^^^^ ^^ ^^^^

	[22, H, 14, T, -, 8, 15, -, T, H, E, -, 6, E, 24, 22, 9, 12, 20, -, 2, 9, 12, -, T, H, 8, 15, -, 5, 8, 10, H, E, 12]

	Next we concentrate on words 1, 2 and 6. Recall the ending of word 6 is the same as the end of word 6. This limits the possibilities for word 2 to 'IN', 'AN', 'OR', 'IS'. For now it is safe to assume that no other encoding correspons to 'T' (in addition to 16 we found) so we discard 'IT' and 'AT' as possibilities. Given that this word comes before 'THE' we further reduce the reasonable possibilities down to 'IN', 'OR', 'IS'. Now we consider the alternatives for the first word:

	_H_T-IN-THE-
	_H_T-OR-THE-
	_H_T-IS-THE-

	Probable words for word one are:
	CHAT Verb/Noun
	CHIT Verb/Noun
	SHOT Noun
	SHUT Verb/Adj/Noun
	THAT Pronoun
	WHAT Pronoun
	WHET Verb
	WHIT Noun

	By the same reasoning for 'E' given above we can assume that we've already found 'T' and discount the 'THAT' option. Of these, 'OR' is starting to make the least sense. Certainly 14 is a vowel. These now are the most probable:

	SHOT-IN-THE-
	WHAT-IS-THE-

	Since the forth word is not going to be 'DARK' I'm more inclined to the last option. Lets try this and see what we get:

	"""

	remap = { # Frequency
	26: '-', # 6
	19: 'H', # 4 Vowel
	18: 'E', # 3
	16: 'T', # 3
	12: '_', # 3
	8: 'I', # 3 Vowel
	15: 'S', # 2
	9: '_', # 2
	24: '_', # 1
	22: 'W', # 1
	20: '_', # 1
	14: 'A', # 1 Vowel
	10: '_', # 1
	6: '_', # 1
	5: '_', # 1
	2: '_', # 1
	}

	print
	print "Substituted cipher:"
	print "".join([remap[c] for c in tri_diffs])

	"""
	We now have:

	WHAT-IS-THE-_E_W___-___-THIS-_I_HE_
	^^^^^^

	[W, H, A, T, -, I, S, -, T, H, E, -, 6, E, 24, W, 9, 12, 20, -, 2, 9, 12, -, T, H, I, S, -, 5, I, 10, H, E, 12]

	I next egrep my dictionary with "^.iphe.$" and this results in a unique word "CIPHER". I'm pretty happy about this :) Let's now try these new letters in our mapping:
	"""

	remap = { # Frequency
	26: '-', # 6
	19: 'H', # 4 Vowel
	18: 'E', # 3
	16: 'T', # 3
	12: 'R', # 3
	8: 'I', # 3 Vowel
	15: 'S', # 2
	9: '_', # 2
	24: '_', # 1
	22: 'W', # 1
	20: '_', # 1
	14: 'A', # 1 Vowel
	10: 'P', # 1
	6: '_', # 1
	5: 'C', # 1
	2: '_', # 1
	}

	print
	print "Substituted cipher:"
	print "".join([remap[c] for c in tri_diffs])

	"""
	We now have:
	WHAT-IS-THE-_E_W_R_-__R-THIS-CIPHER
	^^^^^^^

	[W, H, A, T, -, I, S, -, T, H, E, -, 6, E, 24, W, 9, R, 20, -, 2, 9, R, -, T, H, I, S, -, C, I, P, H, E, R]

	Again I egrep my dictionary with "^.e.w.r.$" and this results in:
	eelworm
	keyword <-- I like this one :)
	leeward
	legwork
	network <-- also possible
	seaward
	seaware

	Let's now try 'KEYWORD' in our mapping:
	"""


	remap = { # Frequency
	26: '-', # 6
	19: 'H', # 4 Vowel
	18: 'E', # 3
	16: 'T', # 3
	12: 'R', # 3
	8: 'I', # 3 Vowel
	15: 'S', # 2
	9: 'O', # 2 Vowel
	24: 'Y', # 1
	22: 'W', # 1
	20: 'D', # 1
	14: 'A', # 1 Vowel
	10: 'P', # 1
	6: 'K', # 1
	5: 'C', # 1
	2: '_', # 1
	}

	print
	print "Substituted cipher:"
	print "".join([remap[c] for c in tri_diffs])

	"""
	We're almost there:
	WHAT-IS-THE-KEYWORD-_OR-THIS-CIPHER
	^^^

	[W, H, A, T, -, I, S, -, T, H, E, -, K, E, Y, W, O, R, D, -, 2, 0, R, -, T, H, I, S, -, C, I, P, H, E, R]

	The only missing letter is 2 which seems pretty clear now is 'F':
	"""

	remap = { # Frequency
	26: '-', # 6
	19: 'H', # 4
	18: 'E', # 3 Vowel
	16: 'T', # 3
	12: 'R', # 3
	8: 'I', # 3 Vowel
	15: 'S', # 2
	9: 'O', # 2 Vowel
	24: 'Y', # 1
	22: 'W', # 1
	20: 'D', # 1
	14: 'A', # 1 Vowel
	10: 'P', # 1
	6: 'K', # 1
	5: 'C', # 1
	2: 'F', # 1
	}

	print
	print "Substituted cipher:"
	print "".join([remap[c] for c in tri_diffs])


	"""
	Finally we get the plaintext:

	WHAT-IS-THE-KEYWORD-FOR-THIS-CIPHER

	Clearly we're not done yet! Notice how we have 9 vowels, very close to our prediction.
	"""


	"""
	From what we discovered above we know that the cipher alphabet is not just a shifted plain alphabet. Next we print out the cipher alphabet below the plain alphabet (in normal) sequence to see if we can spot any patterns.
	"""

	print

	print remap
	# Suggested by: https://www.reddit.com/r/puzzles/comments/3w6ja9/gchqs_christmas_puzzler_thread_spoilers_tagged/cxwmelf
	enc_map = dict()
	for k,v in remap.iteritems():
	if k != 26: # Ignore space character?
	enc_map[v] = chr(ord('A')+k) # Invert mapping.

	print enc_map

	# Plain alphabet.
	out_str = ""
	for c in [chr(ord('A')+i) for i in range(26)]:
	out_str += c
	print out_str

	# Cipher alphabet.
	out_str = ""
	for c in [chr(ord('A')+i) for i in range(26)]:
	if c in enc_map:
	out_str += enc_map[c]
	else:
	out_str += '_'
	print out_str

	"""
	Gives:

	ABCDEFGHIJKLMNOPQRSTUVWXYZ
	O_FUSC_TI_G___JK_MPQ__W_Y_

	In some types of substitution ciphers we'd expect the low frequency letters WXYZ at the end of the alphabet to line up as typically the keyword would appear at the beginning of the plain alphbet and the remaining letters at the end in normal sequence. That looks like what's going on here too.

	If it's not already clear what the possible keyword is we can egrep our dictionary for "^o.fusc" and get:
	obfuscate
	obfuscable
	obfuscated
	obfuscates
	obfuscator
	obfuscating <-- this one matches the remaining letters in the cipher alphabet
	obfuscation
	obfuscators
	obfuscatory

	We can now fill in the remaining letters of the cipher alphabet sequence:

	ABCDEFGHIJKLMNOPQRSTUVWXYZ
	OBFUSCATINGDEHJKLMPQRVWXYZ

	The keyword is therefore:

	OBFUSCATING

	"""