Skip to content

Instantly share code, notes, and snippets.

@TheRockStarDBA
Forked from notareverser/histogram.py
Created July 7, 2022 18:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TheRockStarDBA/dc0423fbf49322927ed05bf6b32e2d37 to your computer and use it in GitHub Desktop.
Save TheRockStarDBA/dc0423fbf49322927ed05bf6b32e2d37 to your computer and use it in GitHub Desktop.
Frequency analysis tool
#!/usr/bin/env python3
import argparse
import sys
import mmap
import logging
from collections import defaultdict
# Route all log records to stderr with an ISO-8601 style timestamp; only
# errors are shown unless the level is changed later.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%dT%H:%M:%S',
    handlers=[logging.StreamHandler(sys.stderr)],
)
def convertToNumber(values, endianness):
    """Decode a run of bytes into a single unsigned integer.

    Args:
        values: Bytes-like object (or iterable of ints in ``range(256)``)
            in the order the bytes appear in the input.
        endianness: ``'big'`` when the first byte is the most significant
            one; any other value is treated as ``'little'`` (first byte is
            the least significant one).

    Returns:
        The decoded non-negative integer; ``0`` for empty input.
    """
    # Map explicitly instead of passing the argument through, so that an
    # unexpected value falls back to little-endian rather than raising.
    byteOrder = 'big' if endianness == 'big' else 'little'
    return int.from_bytes(values, byteOrder)
def produceFrequencies(filename, args):
    """Count fixed-size byte groups in a file and print the frequency table.

    The file is split into consecutive, non-overlapping groups of
    ``args.size`` bytes. Each group is decoded to an integer with
    ``convertToNumber`` and tallied. One line per distinct value is printed
    to stdout as ``<count> <value>``, both zero-padded.

    Args:
        filename: Path of the file to analyse.
        args: Parsed options providing ``size``, ``endianness``,
            ``sort_values`` and ``display_type``.

    Raises:
        ValueError: If ``args.size`` is smaller than 1.
        OSError: If the file cannot be opened or mapped.
    """
    ngramSize = args.size
    if ngramSize < 1:
        raise ValueError(
            'size must be a positive number of bytes, got {!r}'.format(ngramSize))

    frequencies = defaultdict(int)
    with open(filename, 'rb') as rawFile:
        # Seeking to the end (whence=2) returns the file size in bytes.
        fileSize = rawFile.seek(0, 2)
        # Integer arithmetic only: float division loses precision once the
        # file size no longer fits in a double's mantissa.
        numValues, remainder = divmod(fileSize, ngramSize)
        if remainder != 0:
            logging.warning('Ignoring {:d} bytes at the end'.format(remainder))
        # mmap refuses to map an empty file, so only map when there is at
        # least one complete group to read.
        if numValues > 0:
            # Memory-map instead of reading so giant files are paged in on
            # demand; both the map and the file are closed on exit.
            with mmap.mmap(rawFile.fileno(), 0, access=mmap.ACCESS_READ) as mapped:
                for offset in range(0, numValues * ngramSize, ngramSize):
                    nextBytes = mapped[offset:offset + ngramSize]
                    nextVal = convertToNumber(nextBytes, args.endianness)
                    frequencies[nextVal] += 1

    values = frequencies.items()
    if args.sort_values:
        # Most frequent values first.
        values = sorted(values, key=lambda item: item[1], reverse=True)

    if args.display_type == 'decimal':
        outputType = 'd'
    else:
        if args.display_type != 'hex':
            logging.error("Invalid output type, defaulting to hex")
        outputType = 'x'

    # Counts are padded to the digit count of the file size (no count can be
    # larger); values are padded to two digits per byte in the group.
    outputFormatter = '{{:0{:d}d}} {{:0{:d}{}}}'.format(
        len(str(fileSize)), ngramSize * 2, outputType)
    for value, freq in values:
        print(outputFormatter.format(freq, value))
def parseArguments():
    """Parse the command line and apply the requested log level.

    Reads ``sys.argv``. When ``--verbose`` names a standard ``logging``
    level (case-insensitive, e.g. ``debug``), the root logger is switched
    to that level; an unrecognised name is reported and otherwise ignored.

    Returns:
        The ``argparse.Namespace`` with ``files``, ``sort_values``,
        ``display_type``, ``size``, ``endianness`` and ``verbose``.

    Raises:
        SystemExit: On invalid arguments, including a ``--size`` below 1.
    """
    parser = argparse.ArgumentParser(description="Arguments for script")
    parser.add_argument('files', nargs='+')
    parser.add_argument('-S', '--sort_values', action='store_true', default=False,
                        help='If specified, sort the output by frequency')
    parser.add_argument('-d', '--display_type', action='store', default='hex',
                        choices=['hex', 'decimal'],
                        help='Specify the output format (hex or decimal, NO OCTAL FOR YOU')
    parser.add_argument('-s', '--size', action='store', type=int, default=1,
                        help='Specifies the number of bytes to compute frequencies for')
    parser.add_argument('-e', '--endianness', action='store', default='little',
                        choices=['little', 'big'],
                        help='Specify the endianness to compute multi-byte values (default is little endian)')
    parser.add_argument('-v', '--verbose', action='store', default=None,
                        help='If specified, output verbose input')
    args = parser.parse_args()

    # A group size of zero or less cannot partition a file; reject it here
    # with a usage message instead of failing later during the analysis.
    if args.size < 1:
        parser.error('--size must be a positive integer')

    if args.verbose is not None:
        newLevel = getattr(logging, args.verbose.upper(), None)
        # bool is a subclass of int; exclude it so that boolean attributes
        # of the logging module are not mistaken for level numbers.
        if isinstance(newLevel, int) and not isinstance(newLevel, bool):
            logging.getLogger().setLevel(newLevel)
        else:
            logging.error('Unknown log level %r, keeping the current level',
                          args.verbose)
    return args
def main():
    """Run the frequency analysis for every file given on the command line.

    A file that cannot be opened or read is reported through ``logging``
    and skipped, so the remaining files are still processed.

    Returns:
        ``0`` when every file was processed, ``1`` if at least one failed.
    """
    args = parseArguments()
    exitStatus = 0
    for f in args.files:
        try:
            produceFrequencies(f, args)
        except OSError as err:
            logging.error('Cannot process %s: %s', f, err)
            exitStatus = 1
    return exitStatus


if __name__ == '__main__':
    # Propagate the failure status to the shell.
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment