Created March 27, 2014 18:21
Python script that performs basic tests (derived from the ent utility) on a binary file and also produces both a histogram of the distribution of byte values and a scatter plot of the file's byte pairs. Useful for preliminary testing of the quality of entropy (random) data sources.
#! /usr/bin/python
# This program will take a file name from the command line and analyze its entropy, using many of the same algorithms
# as the ent program from hotbits
import sys
import struct
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import scipy.stats as stats
# This array contains the number of 1's contained in each byte value; 0-255
ones = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
byte2bin = ["00000000","00000001","00000010","00000011","00000100","00000101","00000110","00000111",
def calcscc(dt_array, tc):
    """Return the circular lag-1 serial correlation coefficient of a sequence.

    Each sample is paired with its successor, and the last sample is paired
    with the first, so the sequence is treated as a ring.

    Args:
        dt_array: sequence of numeric samples (a list or a NumPy array).
        tc: sample count used in the formula; the caller in this file passes
            the total of the byte histogram.

    Returns:
        The coefficient as a float, or the sentinel ``-100000.0`` when the
        coefficient is undefined: empty input, or a zero denominator (which
        happens, for example, when every sample has the same value).
    """
    # Work in float64 so products of small integer types (e.g. uint8)
    # cannot wrap around.
    samples = np.asarray(dt_array, dtype=np.float64)
    if samples.size == 0:
        return -100000.0

    # Sum of x[i] * x[i + 1], plus the wrap-around term x[last] * x[first].
    scct1 = np.dot(samples[:-1], samples[1:]) + samples[-1] * samples[0]
    # Square of the sum of the samples.
    scct2 = samples.sum() ** 2
    # Sum of the squared samples.
    scct3 = np.dot(samples, samples)

    denominator = tc * scct3 - scct2
    if denominator == 0.0:
        # Undefined correlation: report the sentinel instead of dividing.
        return -100000.0
    return float((tc * scct1 - scct2) / denominator)
def calcent(hist_array, tc):
    """Return the Shannon entropy, in bits per symbol, of a histogram.

    Args:
        hist_array: iterable of occurrence counts, one entry per symbol
            value (256 entries for a byte histogram, but any length works).
        tc: total number of symbols counted, i.e. the sum of the counts.

    Returns:
        The entropy as a float. Returns 0.0 when ``tc`` is zero, because an
        empty sample carries no information.
    """
    if tc == 0:
        return 0.0
    total = tc * 1.0  # force float division under Python 2 as well
    ent = 0.0
    for count in hist_array:
        prob = count / total
        # Symbols that never occur contribute nothing, and log(1/0) is
        # undefined, so skip them.
        if prob > 0.0:
            ent += prob * math.log(1 / prob, 2)
    return ent
def ent_bytes(original_array, hist_array):
    """Compute ent-style randomness statistics for a stream of bytes.

    Args:
        original_array: the raw sample sequence; it is passed unchanged to
            ``calcscc`` for the serial correlation test.
        hist_array: indexable with at least 256 entries, where
            ``hist_array[v]`` is the number of occurrences of byte value
            ``v``.

    Returns:
        A dict with the keys ``bitsRead``, ``totalOnes``, ``totalc``,
        ``cexp``, ``chisq``, ``entropy``, ``compression``,
        ``chisqProbability``, ``serCorCoef`` and ``arithmeticMean``.

    Raises:
        ValueError: if the histogram contains no samples.
    """
    # Counts are whole numbers but may arrive as floating-point values;
    # coerce them so the totals are plain ints (the script prints them with
    # integer format codes).
    counts = [int(hist_array[idx]) for idx in range(256)]

    totalc = sum(counts)
    if totalc == 0:
        raise ValueError("cannot analyse an empty histogram")
    # ones[v] is the number of set bits in byte value v.
    totalOnes = sum(count * ones[value] for value, count in enumerate(counts))
    bitsRead = totalc * 8

    # Expected count per byte value if all 256 values were equally likely.
    cexp = totalc / 256.0
    chisq = 0.0
    datasum = 0
    for value, count in enumerate(counts):
        deviation = count - cexp
        chisq += (deviation * deviation) / cexp
        datasum += value * count

    entropy = calcent(counts, totalc)
    compression = (8 - entropy) / 8
    # The survival function is the upper tail 1 - cdf, computed directly so
    # that small probabilities are not lost to rounding.
    chisqProbability = float(stats.distributions.chi2.sf(chisq, 255))
    serCorCoef = calcscc(original_array, totalc)
    arithmeticMean = datasum / (totalc * 1.0)

    return {
        'bitsRead': bitsRead,
        'totalOnes': totalOnes,
        'totalc': totalc,
        'cexp': cexp,
        'chisq': chisq,
        'entropy': entropy,
        'compression': compression,
        'chisqProbability': chisqProbability,
        'serCorCoef': serCorCoef,
        'arithmeticMean': arithmeticMean,
    }
# ---------------------------------------------------------------------------
# Script body.
#
# Takes one command-line argument, the file to analyse.  The file is read as
# raw bytes, a figure holding a byte-value histogram and a byte-pair scatter
# plot is written to "<filename>.png", and an ent-style report is printed.
# ---------------------------------------------------------------------------
if len(sys.argv) < 2:
    print("Must provide a filename to process! ./ filename")
    sys.exit(1)
filename = sys.argv[1]

try:
    data = np.fromfile(filename, dtype=np.uint8)
except (IOError, OSError):
    print("Failed!")
    sys.exit(1)
if data.size == 0:
    # Every statistic below divides by the number of bytes read.
    print("Failed!")
    sys.exit(1)

# Exact integer count of every byte value 0..255.  The histogram drawn below
# is only a picture; its return value is not used for the statistics, so the
# counts are always integers and always aligned with the byte values.
n = np.bincount(data, minlength=256)

fig = plt.figure(figsize=(8, 10), dpi=100)
ax = fig.add_subplot(211)
# An explicit range keeps one bin per byte value even when the data does not
# contain both 0 and 255.
_, bins, patches = ax.hist(data, bins=256, range=(0, 256))
ax.set_xlabel('Byte Values')
ax.set_title('Histogram of '+filename)

# Pair up consecutive bytes, dropping a trailing odd byte, and plot each pair
# as one point.  NOTE(review): the reviewed source computed the pairs and the
# output name but never used them; the scatter/savefig calls complete that.
b = np.reshape(data[:len(data) - len(data) % 2], (-1, 2))
bx = fig.add_subplot(212)
bx.scatter(b[:, 0], b[:, 1], s=1)
bx.set_xlabel('Byte Values')
bx.set_ylabel('Byte Values')
bx.set_title('Scatter Plot of '+filename)
fn = filename + '.png'
fig.savefig(fn)

entropy = ent_bytes(data, n)
# Plain ints so the integer format codes below always apply.
bitsRead = int(entropy['bitsRead'])
totalOnes = int(entropy['totalOnes'])
totalc = int(entropy['totalc'])
totalZeroes = bitsRead - totalOnes
totalZeroesPercent = totalZeroes / (bitsRead * 1.0)
totalOnesPercent = totalOnes / (bitsRead * 1.0)
totalPercent = bitsRead / (totalc * 8.0)

# Bit-level table.
print(" ")
print("Value Char Occurrences Fraction")
print("{:4d} {:11d} {:12.10f}".format(0, totalZeroes, totalZeroesPercent))
print("{:4d} {:11d} {:12.10f}".format(1, totalOnes, totalOnesPercent))
print("Total: {:11d} {:12.10f}".format(bitsRead, totalPercent))
print(" ")
print(" ")

# Byte-level table: count, fraction, expected count and absolute deviation.
print("Value Char Occurrences Fraction Expectation Deviation")
expected = totalc / 256.0
cumdev = 0.0
for idx in range(256):
    count = int(n[idx])
    deviation = math.fabs(count - expected)
    cumdev += deviation
    print("{:4d} {:11,d} {:12.9%} {:11,.2f} {:12,.4f}".format(idx, count, (count / (totalc * 1.0)), expected, deviation))
print("Total: {:11,d} {:12.7%} Mean ={:13,.4f}".format(totalc, (totalc / (totalc * 1.0)), (cumdev / 256.0)))
print(" ")

# Summary statistics.
print("Entropy = {:8.6f} bits per byte.".format(entropy['entropy']))
print(" ")
print("Optimum compression would reduce the size")
print("of this {:,} byte file by {:.2%}".format(totalc, entropy['compression']))
print(" ")
print("Chi square distribution for {:,} samples is {:.2f}, and randomly".format(totalc, entropy['chisq']))
print("would exceed this value {:.2%} percent of the time.".format(entropy['chisqProbability']))
print(" ")
print("Arithetic mean value of data bytes is {:.4f} (127.5 = random)".format(entropy['arithmeticMean']))
print("Serial correlation coefficient is {:.6f} (totally uncorrelated = 0.0).".format(entropy['serCorCoef']))
