enigmaticape/freq.py

## readme.txt
Some Python scripts used in exploration of the Pigeon Code, a WW2 era
UK code found attached to the leg of a dead pigeon in December 2012.

There is a lot of shared code between these scripts, and they are of, er,
varying quality as I knocked them up on a Saturday afternoon for the sole
purpose of hacking away on the Pigeon Code.

Shared in the spirit of making my rather limited investigation repeatable.

See the associated blog post at http://www.enigmaticape.com/blog/pigeon-code-some-idle-speculation-with-graphs/

## freq.py
import os
import sys
import codecs
import argparse

def build_parser():
    """Return the argparse parser describing this script's command line."""
    parser = argparse.ArgumentParser("Count frequency of letters in ciphertext")
    parser.add_argument("-i", "--input", help="input file", action="store")
    parser.add_argument("-o", "--output", help="output file", action="store")
    parser.add_argument("-e", "--encoding", help="utf-8 | ascii",
                        action="store", default="utf-8")
    # Accepted so existing command lines keep working; nothing below reads it.
    parser.add_argument("-l", "--locale", help="locale name", action="store")
    # NB that if you do use additional symbols, you are going
    # to have no fun at all if one of them is a comma: the output
    # is comma separated and nothing is quoted.
    parser.add_argument("-a", "--addsyms", help="additional symbols",
                        action="store")
    return parser


def open_input(path, encoding):
    """Return a decoding reader for *path*, or for STDIN when *path* is empty.

    Exits with status 1 after writing a message to stderr when *path* is
    given but does not exist.
    """
    if path:
        if not os.path.exists(path):
            sys.stderr.write(path + "\nFile not found :-( \n")
            sys.exit(1)
        return codecs.open(path, 'rb', encoding=encoding)

    # Python 3 keeps the byte stream in sys.stdin.buffer; on Python 2
    # sys.stdin itself already yields bytes.
    raw = getattr(sys.stdin, "buffer", sys.stdin)
    if sys.stdin.isatty():
        # A terminal reports its own encoding; piped input uses --encoding.
        encoding = sys.stdin.encoding or encoding
    return codecs.getreader(encoding)(raw)


def open_output(path, encoding):
    """Return an encoding writer for *path*, or for STDOUT when *path* is empty."""
    if path:
        return codecs.open(path, "w", encoding)

    # If we're using stdout, there are some pesky encoding issues
    # to deal with. We default to UTF 8. Your TTY should be UTF8.
    raw = getattr(sys.stdout, "buffer", sys.stdout)
    if sys.stdout.isatty():
        encoding = sys.stdout.encoding or encoding
    return codecs.getwriter(encoding)(raw)


def count_symbols(text, extra_symbols=""):
    """Count the letters (and any *extra_symbols*) in *text*.

    The text is upper-cased first, so counting is case-insensitive.
    Returns a dict mapping each counted character to its number of
    occurrences. *extra_symbols* may be None, meaning "letters only".
    """
    extra_symbols = extra_symbols or ""
    counts = {}
    for char in text.upper():
        if char.isalpha() or char in extra_symbols:
            counts[char] = counts.get(char, 0) + 1
    return counts


def format_counts(counts):
    """Render *counts* as two comma separated lines: labels, then totals.

    Columns are sorted by label so the two lines always line up.
    """
    ordered = sorted(counts.items())
    labels = [label for label, _ in ordered]
    totals = [str(total) for _, total in ordered]
    return ",".join(labels) + "\n" + ",".join(totals) + "\n"


def main():
    """Read the ciphertext, count its letters and write the two CSV lines."""
    args = build_parser().parse_args()

    cipher = open_input(args.input, args.encoding)
    try:
        cipher_text = cipher.read()
    finally:
        if args.input:
            cipher.close()

    counts = count_symbols(cipher_text, args.addsyms)

    output = open_output(args.output, args.encoding)
    try:
        output.write(format_counts(counts))
    finally:
        # Only close what we opened ourselves; STDOUT just gets flushed.
        if args.output:
            output.close()
        else:
            output.flush()


if __name__ == "__main__":
    main()

## grid.py
import os
import sys
import codecs
import argparse

def build_parser():
    """Return the argparse parser describing this script's command line."""
    parser = argparse.ArgumentParser("Grid some text into 5x5 blocks")
    parser.add_argument("-i", "--input", help="input file", action="store")
    parser.add_argument("-o", "--output", help="Output file", action="store")
    parser.add_argument("-e", "--encoding", help="utf-8 | ascii",
                        action="store", default="utf-8")
    return parser


def open_input(path, encoding):
    """Return a decoding reader for *path*, or for STDIN when *path* is empty.

    Exits with status 1 after writing a message to stderr when *path* is
    given but does not exist.
    """
    if path:
        if not os.path.exists(path):
            sys.stderr.write(path + "\nFile not found :-( \n")
            sys.exit(1)
        return codecs.open(path, 'rb', encoding=encoding)

    # Python 3 keeps the byte stream in sys.stdin.buffer; on Python 2
    # sys.stdin itself already yields bytes.
    raw = getattr(sys.stdin, "buffer", sys.stdin)
    if sys.stdin.isatty():
        # A terminal reports its own encoding; piped input uses --encoding.
        encoding = sys.stdin.encoding or encoding
    return codecs.getreader(encoding)(raw)


def open_output(path, encoding):
    """Return an encoding writer for *path*, or for STDOUT when *path* is empty."""
    if path:
        return codecs.open(path, "w", encoding)

    # If we're using stdout, there are some pesky encoding issues
    # to deal with. We default to UTF 8. Your TTY should be UTF8.
    raw = getattr(sys.stdout, "buffer", sys.stdout)
    if sys.stdout.isatty():
        encoding = sys.stdout.encoding or encoding
    return codecs.getwriter(encoding)(raw)


def grid_text(text, group_size=5, groups_per_row=5):
    """Lay the letters of *text* out in rows of fixed-size groups.

    Non-letters are dropped and the rest is upper-cased. Every group is
    followed by a single space and every row by a newline; the last group
    and the last row may be short. Text without letters yields "".
    """
    letters = [c for c in text.upper() if c.isalpha()]
    groups = [
        "".join(letters[pos:pos + group_size])
        for pos in range(0, len(letters), group_size)
    ]
    rows = []
    for pos in range(0, len(groups), groups_per_row):
        row = groups[pos:pos + groups_per_row]
        rows.append("".join(group + " " for group in row) + "\n")
    return "".join(rows)


def main():
    """Read the input text and write it back out as a grid of letter groups."""
    args = build_parser().parse_args()

    source = open_input(args.input, args.encoding)
    try:
        letters = source.read()
    finally:
        if args.input:
            source.close()

    # The output is opened only once the input has been read, so a failed
    # read does not leave a truncated output file behind.
    output = open_output(args.output, args.encoding)
    try:
        # Nothing but the grid goes to the output stream: it may be STDOUT,
        # so stray debug prints would corrupt the result.
        output.write(grid_text(letters))
    finally:
        # Only close what we opened ourselves; STDOUT just gets flushed.
        if args.output:
            output.close()
        else:
            output.flush()


if __name__ == "__main__":
    main()

## histo.py
# Given a two-line input of comma-separated values,
# draws a bar chart, assuming the first line is labels
# and the second line is counts.

# Requires reportlab and PIP

import sys
import os


# easy_install PIP
# easy_install reportlab
from   reportlab.graphics.shapes import Drawing
from   reportlab.graphics.charts.barcharts import VerticalBarChart


import codecs
import argparse

def build_parser():
    """Return the argparse parser describing this script's command line."""
    parser = argparse.ArgumentParser("Draw a histogram")
    parser.add_argument("-i", "--input", help="input file", action="store")
    parser.add_argument("-o", "--output", help="output file base name",
                        action="store", default="histo")
    parser.add_argument("-e", "--encoding", help="utf-8 | ascii",
                        action="store", default="utf-8")
    # type=int lets argparse reject a non-numeric step with a usage message
    # instead of a traceback further down.
    parser.add_argument("-s", "--step", help="value axis step",
                        action="store", type=int, default=1)
    return parser


def open_input(path, encoding):
    """Return a decoding reader for *path*, or for STDIN when *path* is empty.

    Exits with status 1 after writing a message to stderr when *path* is
    given but does not exist.
    """
    if path:
        if not os.path.exists(path):
            sys.stderr.write(path + "\nFile not found :-( \n")
            sys.exit(1)
        return codecs.open(path, 'rb', encoding=encoding)

    # Python 3 keeps the byte stream in sys.stdin.buffer; on Python 2
    # sys.stdin itself already yields bytes.
    raw = getattr(sys.stdin, "buffer", sys.stdin)
    if sys.stdin.isatty():
        # A terminal reports its own encoding; piped input uses --encoding.
        encoding = sys.stdin.encoding or encoding
    return codecs.getreader(encoding)(raw)


def parse_histogram(label_line, data_line):
    """Split the two input lines into a list of labels and a list of ints.

    Both lines are stripped and split on commas. Raises ValueError when the
    counts line is empty or holds something that is not an integer.
    """
    labels = label_line.strip().split(",")
    values = data_line.strip()
    if not values:
        raise ValueError("no counts found on the second input line")
    data = [int(value) for value in values.split(",")]
    return labels, data


def main():
    """Read labels and counts, then save the bar chart as a PNG."""
    args = build_parser().parse_args()

    histo_file = open_input(args.input, args.encoding)
    try:
        label_data = histo_file.readline()
        data_data = histo_file.readline()
    finally:
        if args.input:
            histo_file.close()

    try:
        labels, data = parse_histogram(label_data, data_data)
    except ValueError as err:
        sys.stderr.write("Bad histogram input: %s\n" % err)
        sys.exit(1)

    drawing = Drawing(600, 500)
    chart = VerticalBarChart()
    chart.width = 560
    chart.height = 460
    chart.x = 20
    chart.y = 20

    # The chart takes a list of data series; we only ever draw one.
    chart.data = [data]
    chart.categoryAxis.categoryNames = labels
    chart.valueAxis.valueMin = 0
    chart.valueAxis.valueStep = int(args.step)
    drawing.add(chart)
    drawing.save(fnRoot=args.output, formats=['png'])


if __name__ == "__main__":
    main()

## polycipher.py
import os
import sys
import argparse
import collections

# About the simplest polyalphabetic cipher you can get.

# The full upper-case Latin alphabet, "A" through "Z" inclusive.
ALPHABET = [chr(c) for c in range(ord("A"), ord("Z") + 1)]


def polycipher(text, key, decipher=False):
    """Encipher (or decipher) *text* with the polyalphabetic key *key*.

    Text and key are upper-cased. For every letter A-Z the alphabet is
    rotated right by the character code of the current key character, and
    the key itself is then rotated right by one, so after its first
    character the key is consumed from its end backwards. Characters
    outside A-Z are copied through unchanged and do not consume key
    material.

    Returns the transformed text. Raises ValueError for an empty key.
    """
    if not key:
        raise ValueError("key must not be empty")

    keyshift = collections.deque(key.upper())
    pieces = []

    for start in text.upper():
        if start not in ALPHABET:
            # Punctuation, digits, accented letters: pass through untouched.
            pieces.append(start)
            continue

        cipherbet = collections.deque(ALPHABET)
        cipherbet.rotate(ord(keyshift[0]))
        keyshift.rotate(1)

        if decipher:
            out = ALPHABET[list(cipherbet).index(start)]
        else:
            out = cipherbet[ALPHABET.index(start)]
        pieces.append(out)

    return "".join(pieces)


def main():
    """Parse the command line and print the enciphered or deciphered text."""
    parser = argparse.ArgumentParser("Encipher some text with a simple polyalpbetic cipher")
    parser.add_argument("text", action="store")
    parser.add_argument("key", action="store")
    parser.add_argument("-e", "--encipher", action="store_true")
    parser.add_argument("-d", "--decipher", action="store_true")
    args = parser.parse_args()

    if not args.key:
        parser.error("key must not be empty")

    # Enciphering is the default when no mode flag is given; -d wins when
    # both flags are present, as it always has.
    print(polycipher(args.text, args.key, decipher=args.decipher))


if __name__ == "__main__":
    main()

## scrape.py
# scrape textual data from a webpage and write it
# to a file or stdout

import os
import sys
import codecs
import argparse

# easy_install lxml
from   lxml import html
from   lxml.html.clean import clean_html


# process command line args
def build_parser():
    """Return the argparse parser describing this script's command line."""
    parser = argparse.ArgumentParser("Scrape text from a url")
    parser.add_argument("url", help="http://some.website.com", action="store")
    parser.add_argument("-o", "--output", help="output file", action="store")
    parser.add_argument("-e", "--encoding", help="utf-8 | ascii",
                        default="utf-8", action="store")
    parser.add_argument("-l", "--locale", help="locale name", action="store")
    return parser


def open_output(path, encoding):
    """Return an encoding writer for *path*, or for STDOUT when *path* is empty."""
    if path:
        return codecs.open(path, "w", encoding)

    # If we're using stdout, there are some pesky encoding issues
    # to deal with. We default to UTF 8. Your TTY should be UTF8.
    # Python 3 keeps the byte stream in sys.stdout.buffer; on Python 2
    # sys.stdout itself already accepts bytes.
    raw = getattr(sys.stdout, "buffer", sys.stdout)
    if sys.stdout.isatty():
        encoding = sys.stdout.encoding or encoding
    return codecs.getwriter(encoding)(raw)


def main():
    """Fetch the URL, reduce the page to its text and write the text out."""
    args = build_parser().parse_args()

    # set the locale if one was provided
    if args.locale:
        # Imported here because the script's import block does not bring
        # `locale` in, so --locale used to fail with a NameError.
        import locale
        locale.setlocale(locale.LC_ALL, args.locale)

    # get and clean the HTML
    tree = html.parse(args.url)
    tree = clean_html(tree)
    text = tree.getroot().text_content()

    output = open_output(args.output, args.encoding)
    try:
        output.write(text)
    finally:
        # Only close what we opened ourselves; STDOUT just gets flushed.
        if args.output:
            output.close()
        else:
            output.flush()


if __name__ == "__main__":
    main()

## short_shuffle.py
import random

# Scramble a sample sentence: the order of the characters is lost,
# but how often each one occurs is not.
sentence = "Even a short sentence is given away by frequency counting, this is how cryptogrpahers break messages"
letters = list(sentence)

random.shuffle(letters)

print("".join(letters))
	Some Python scripts used in exploration of the Pigeon Code, a WW2 era
	UK code found attached to the leg of a dead pigeon in December 2012.

	There is a lot of shared code between these scripts, and they are of, er,
	varying quality as I knocked them up on a Saturday afternoon for the sole
	purpose of hacking away on the Pigeon Code.

	Shared in the spirit of making my rather limited investigation repeatable.

	See the associated blog post at http://www.enigmaticape.com/blog/pigeon-code-some-idle-speculation-with-graphs/
	import os
	import sys
	import codecs
	import argparse

	def build_parser():
	    """Return the argparse parser describing this script's command line."""
	    parser = argparse.ArgumentParser("Count frequency of letters in ciphertext")
	    parser.add_argument("-i", "--input", help="input file", action="store")
	    parser.add_argument("-o", "--output", help="output file", action="store")
	    parser.add_argument("-e", "--encoding", help="utf-8 | ascii",
	                        action="store", default="utf-8")
	    # Accepted so existing command lines keep working; nothing below reads it.
	    parser.add_argument("-l", "--locale", help="locale name", action="store")
	    # NB that if you do use additional symbols, you are going
	    # to have no fun at all if one of them is a comma: the output
	    # is comma separated and nothing is quoted.
	    parser.add_argument("-a", "--addsyms", help="additional symbols",
	                        action="store")
	    return parser


	def open_input(path, encoding):
	    """Return a decoding reader for *path*, or for STDIN when *path* is empty.

	    Exits with status 1 after writing a message to stderr when *path* is
	    given but does not exist.
	    """
	    if path:
	        if not os.path.exists(path):
	            sys.stderr.write(path + "\nFile not found :-( \n")
	            sys.exit(1)
	        return codecs.open(path, 'rb', encoding=encoding)

	    # Python 3 keeps the byte stream in sys.stdin.buffer; on Python 2
	    # sys.stdin itself already yields bytes.
	    raw = getattr(sys.stdin, "buffer", sys.stdin)
	    if sys.stdin.isatty():
	        # A terminal reports its own encoding; piped input uses --encoding.
	        encoding = sys.stdin.encoding or encoding
	    return codecs.getreader(encoding)(raw)


	def open_output(path, encoding):
	    """Return an encoding writer for *path*, or for STDOUT when *path* is empty."""
	    if path:
	        return codecs.open(path, "w", encoding)

	    # If we're using stdout, there are some pesky encoding issues
	    # to deal with. We default to UTF 8. Your TTY should be UTF8.
	    raw = getattr(sys.stdout, "buffer", sys.stdout)
	    if sys.stdout.isatty():
	        encoding = sys.stdout.encoding or encoding
	    return codecs.getwriter(encoding)(raw)


	def count_symbols(text, extra_symbols=""):
	    """Count the letters (and any *extra_symbols*) in *text*.

	    The text is upper-cased first, so counting is case-insensitive.
	    Returns a dict mapping each counted character to its number of
	    occurrences. *extra_symbols* may be None, meaning "letters only".
	    """
	    extra_symbols = extra_symbols or ""
	    counts = {}
	    for char in text.upper():
	        if char.isalpha() or char in extra_symbols:
	            counts[char] = counts.get(char, 0) + 1
	    return counts


	def format_counts(counts):
	    """Render *counts* as two comma separated lines: labels, then totals.

	    Columns are sorted by label so the two lines always line up.
	    """
	    ordered = sorted(counts.items())
	    labels = [label for label, _ in ordered]
	    totals = [str(total) for _, total in ordered]
	    return ",".join(labels) + "\n" + ",".join(totals) + "\n"


	def main():
	    """Read the ciphertext, count its letters and write the two CSV lines."""
	    args = build_parser().parse_args()

	    cipher = open_input(args.input, args.encoding)
	    try:
	        cipher_text = cipher.read()
	    finally:
	        if args.input:
	            cipher.close()

	    counts = count_symbols(cipher_text, args.addsyms)

	    output = open_output(args.output, args.encoding)
	    try:
	        output.write(format_counts(counts))
	    finally:
	        # Only close what we opened ourselves; STDOUT just gets flushed.
	        if args.output:
	            output.close()
	        else:
	            output.flush()


	if __name__ == "__main__":
	    main()
	# Given a two-line input of comma-separated values,
	# draws a bar chart, assuming the first line is labels
	# and the second line is counts.

	# Requires reportlab and PIP

	import sys
	import os


	# easy_install PIP
	# easy_install reportlab
	from reportlab.graphics.shapes import Drawing
	from reportlab.graphics.charts.barcharts import VerticalBarChart



	import codecs
	import argparse

	def build_parser():
	    """Return the argparse parser describing this script's command line."""
	    parser = argparse.ArgumentParser("Draw a histogram")
	    parser.add_argument("-i", "--input", help="input file", action="store")
	    parser.add_argument("-o", "--output", help="output file base name",
	                        action="store", default="histo")
	    parser.add_argument("-e", "--encoding", help="utf-8 | ascii",
	                        action="store", default="utf-8")
	    # type=int lets argparse reject a non-numeric step with a usage message
	    # instead of a traceback further down.
	    parser.add_argument("-s", "--step", help="value axis step",
	                        action="store", type=int, default=1)
	    return parser


	def open_input(path, encoding):
	    """Return a decoding reader for *path*, or for STDIN when *path* is empty.

	    Exits with status 1 after writing a message to stderr when *path* is
	    given but does not exist.
	    """
	    if path:
	        if not os.path.exists(path):
	            sys.stderr.write(path + "\nFile not found :-( \n")
	            sys.exit(1)
	        return codecs.open(path, 'rb', encoding=encoding)

	    # Python 3 keeps the byte stream in sys.stdin.buffer; on Python 2
	    # sys.stdin itself already yields bytes.
	    raw = getattr(sys.stdin, "buffer", sys.stdin)
	    if sys.stdin.isatty():
	        # A terminal reports its own encoding; piped input uses --encoding.
	        encoding = sys.stdin.encoding or encoding
	    return codecs.getreader(encoding)(raw)


	def parse_histogram(label_line, data_line):
	    """Split the two input lines into a list of labels and a list of ints.

	    Both lines are stripped and split on commas. Raises ValueError when the
	    counts line is empty or holds something that is not an integer.
	    """
	    labels = label_line.strip().split(",")
	    values = data_line.strip()
	    if not values:
	        raise ValueError("no counts found on the second input line")
	    data = [int(value) for value in values.split(",")]
	    return labels, data


	def main():
	    """Read labels and counts, then save the bar chart as a PNG."""
	    args = build_parser().parse_args()

	    histo_file = open_input(args.input, args.encoding)
	    try:
	        label_data = histo_file.readline()
	        data_data = histo_file.readline()
	    finally:
	        if args.input:
	            histo_file.close()

	    try:
	        labels, data = parse_histogram(label_data, data_data)
	    except ValueError as err:
	        sys.stderr.write("Bad histogram input: %s\n" % err)
	        sys.exit(1)

	    drawing = Drawing(600, 500)
	    chart = VerticalBarChart()
	    chart.width = 560
	    chart.height = 460
	    chart.x = 20
	    chart.y = 20

	    # The chart takes a list of data series; we only ever draw one.
	    chart.data = [data]
	    chart.categoryAxis.categoryNames = labels
	    chart.valueAxis.valueMin = 0
	    chart.valueAxis.valueStep = int(args.step)
	    drawing.add(chart)
	    drawing.save(fnRoot=args.output, formats=['png'])


	if __name__ == "__main__":
	    main()
	# scrape textual data from a webpage and write it
	# to a file or stdout

	import os
	import sys
	import codecs
	import argparse

	# easy_install lxml
	from lxml import html
	from lxml.html.clean import clean_html


	# process command line args
	def build_parser():
	    """Return the argparse parser describing this script's command line."""
	    parser = argparse.ArgumentParser("Scrape text from a url")
	    parser.add_argument("url", help="http://some.website.com", action="store")
	    parser.add_argument("-o", "--output", help="output file", action="store")
	    parser.add_argument("-e", "--encoding", help="utf-8 | ascii",
	                        default="utf-8", action="store")
	    parser.add_argument("-l", "--locale", help="locale name", action="store")
	    return parser


	def open_output(path, encoding):
	    """Return an encoding writer for *path*, or for STDOUT when *path* is empty."""
	    if path:
	        return codecs.open(path, "w", encoding)

	    # If we're using stdout, there are some pesky encoding issues
	    # to deal with. We default to UTF 8. Your TTY should be UTF8.
	    # Python 3 keeps the byte stream in sys.stdout.buffer; on Python 2
	    # sys.stdout itself already accepts bytes.
	    raw = getattr(sys.stdout, "buffer", sys.stdout)
	    if sys.stdout.isatty():
	        encoding = sys.stdout.encoding or encoding
	    return codecs.getwriter(encoding)(raw)


	def main():
	    """Fetch the URL, reduce the page to its text and write the text out."""
	    args = build_parser().parse_args()

	    # set the locale if one was provided
	    if args.locale:
	        # Imported here because the script's import block does not bring
	        # `locale` in, so --locale used to fail with a NameError.
	        import locale
	        locale.setlocale(locale.LC_ALL, args.locale)

	    # get and clean the HTML
	    tree = html.parse(args.url)
	    tree = clean_html(tree)
	    text = tree.getroot().text_content()

	    output = open_output(args.output, args.encoding)
	    try:
	        output.write(text)
	    finally:
	        # Only close what we opened ourselves; STDOUT just gets flushed.
	        if args.output:
	            output.close()
	        else:
	            output.flush()


	if __name__ == "__main__":
	    main()
	import random

	# Scramble a sample sentence: the order of the characters is lost,
	# but how often each one occurs is not.
	sentence = "Even a short sentence is given away by frequency counting, this is how cryptogrpahers break messages"
	letters = list(sentence)

	random.shuffle(letters)

	print("".join(letters))