Created
April 21, 2012 12:09
-
-
Save domoritz/2436815 to your computer and use it in GitHub Desktop.
Get number of unique characters in a file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""
This prints the number of unique characters from the input. The input is read
either from a file given as a command line argument or from stdin.
Works with Python 2.7 and 3.2.
Use `python unique_chars.py -h` to get more information.
"""
import collections | |
import argparse | |
import sys | |
import math | |
from curses.ascii import isprint | |
from argparse import RawTextHelpFormatter | |
def H(data):
    """
    Calculate the Shannon entropy, in bits, of a frequency table.

    data -- mapping from symbol to its number of occurrences
            (for example a collections.Counter).

    Returns the entropy as a float. An empty table, or one whose counts
    sum to zero, has an entropy of 0.0.
    """
    counts = list(data.values())
    # Sum once up front instead of once per symbol.
    total = sum(counts)
    if not total:
        return 0.0
    total = float(total)
    # Symbols with a zero count contribute nothing to the entropy
    # (p * log(p) tends to 0 as p tends to 0) and math.log(0) would raise.
    probs = [count / total for count in counts if count]
    # Negate each term rather than the finished sum so that a table with a
    # single symbol yields 0.0 instead of -0.0.
    return sum([-p * math.log(p, 2) for p in probs], 0.0)
def main(argv=None):
    """
    Command line entry point: read text and report character statistics.

    argv -- optional list of command line arguments; when None (the
            default) argparse reads them from sys.argv.

    The text is read from the file named by the positional argument, or
    from stdin when no file is given. Depending on --task this prints the
    per-character occurrence counts (most frequent first), the total
    number of characters, the number of distinct characters, or the
    entropy of the character distribution.
    """
    parser = argparse.ArgumentParser(
        description='Count unique characters from a file or stdin.',
        formatter_class=RawTextHelpFormatter)
    parser.add_argument(
        'input', default=sys.stdin, nargs='?',
        type=argparse.FileType('r'), help='file with characters')
    parser.add_argument(
        '-t', '--task', dest='task',
        choices=['print', 'sum', 'count', 'entropy'], default='print',
        help='\n'.join([
            'print = print characters with number of occurences (default)',
            'sum = sum up the number of characters',
            'count = count number of unique characters',
            'entropy = calculate entropy for binary representation of one char']))
    parser.add_argument(
        '-i', '--ignore-last-newline', dest='ignore_nl',
        action='store_true', help='ignore newline at the end of file')
    parser.add_argument(
        '-p', '--printable', dest='printable',
        action='store_true', help='ignore non printable characters')
    args = parser.parse_args(argv)

    try:
        text = args.input.read()
    finally:
        # Close files opened by argparse, but leave stdin alone.
        if args.input is not sys.stdin:
            args.input.close()

    if args.printable:
        text = ''.join(char for char in text if isprint(char))
    # Drop only the single final newline; str.strip('\n') would also remove
    # leading newlines and every additional trailing one.
    if args.ignore_nl and text.endswith('\n'):
        text = text[:-1]

    d = collections.Counter(text)
    if args.task == 'sum':
        print(sum(d.values()))
    elif args.task == 'count':
        print(len(d))
    elif args.task == 'entropy':
        print(H(d))
    elif args.task == 'print':
        # most_common() lists characters from most to least frequent.
        for c, occurrences in d.most_common():
            print("Character: %(char)c: %(#)d" % {'char': c, '#': occurrences})
    else:
        # argparse restricts --task to the choices above.
        raise Exception('should not be reached')
# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment