Skip to content

Instantly share code, notes, and snippets.

@domoritz
Created April 21, 2012 12:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save domoritz/2436815 to your computer and use it in GitHub Desktop.
Save domoritz/2436815 to your computer and use it in GitHub Desktop.
Get number of unique characters in a file.
#!/usr/bin/env python
"""
This prints the number of unique characters from the input. Input can either be a command line argument or stdin.
Works with python 2.7 and 3.2.
Use python unique_chars.py -h to get more information.
"""
import collections
import argparse
import sys
import math
from curses.ascii import isprint
from argparse import RawTextHelpFormatter
def H(data):
    """
    Calculate the Shannon entropy (in bits) of a frequency table.

    data -- mapping from a symbol to its number of occurrences, for
            example a collections.Counter.

    Returns the entropy as a float. An empty table, or one whose counts
    sum to zero, yields 0.0. Entries with a non-positive count contribute
    nothing to the result and are skipped, since log(0) is undefined.
    """
    counts = [count for count in data.values() if count > 0]
    # Compute the total once instead of once per symbol.
    total = float(sum(counts))
    if not total:
        return 0.0
    probs = [count / total for count in counts]
    # Negate each term rather than the final sum so that a table with a
    # single symbol yields 0.0 instead of -0.0.
    return sum([-p * math.log(p, 2) for p in probs])
def main():
    """
    Command line entry point: count the characters of a file or stdin.

    Reads the whole input, optionally drops non printable characters
    (-p) and a single trailing newline (-i), then performs the task
    selected with -t: print every character with its count, print the
    total number of characters, print the number of distinct characters,
    or print the entropy of the character distribution.
    """
    parser = argparse.ArgumentParser(
        description='Count unique characters from a file or stdin.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument('input', default=sys.stdin, nargs='?',
                        type=argparse.FileType('r'),
                        help='file with characters')
    parser.add_argument('-t', '--task', dest='task',
                        choices=['print', 'sum', 'count', 'entropy'],
                        default='print',
                        help='\n'.join([
                            'print = print characters with number of occurences (default)',
                            'sum = sum up the number of characters',
                            'count = count number of unique characters',
                            'entropy = calculate entropy for binary representation of one char']))
    parser.add_argument('-i', '--ignore-last-newline', dest='ignore_nl',
                        action='store_true',
                        help='ignore newline at the end of file')
    parser.add_argument('-p', '--printable', dest='printable',
                        action='store_true',
                        help='ignore non printable characters')
    args = parser.parse_args()

    try:
        text = args.input.read()
    finally:
        # Close files opened by argparse, but leave the process' stdin alone.
        if args.input is not sys.stdin:
            args.input.close()

    if args.printable:
        text = ''.join(char for char in text if isprint(char))
    # Drop only the final newline; str.strip('\n') would also remove
    # leading newlines and whole runs of trailing ones, skewing the counts.
    if args.ignore_nl and text.endswith('\n'):
        text = text[:-1]

    d = collections.Counter(text)
    if args.task == 'sum':
        print(sum(d.values()))
    elif args.task == 'count':
        print(len(d))
    elif args.task == 'entropy':
        print(H(d))
    elif args.task == 'print':
        # most_common() yields (character, count) pairs, highest count first.
        for c, occurrences in d.most_common():
            print("Character: %(char)c: %(#)d" % {'char': c, '#': occurrences})
    else:
        # argparse restricts --task to the choices above.
        raise Exception('should not be reached')
# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment