Created
October 26, 2016 11:01
-
-
Save shagunsodhani/9a46fe803c8087dcde686b7e37e60567 to your computer and use it in GitHub Desktop.
Script to calculate entropy for any column in a file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to calculate entropy for any column in a file.
from __future__ import print_function | |
import numpy as np | |
def entropy(file_path, sep, col_index, col_name):
    """Print and return the Shannon entropy (in bits) of one column of a file.

    The column is read with ``read_column``; each distinct value is treated
    as one outcome whose probability is its relative frequency in the column.

    Args:
        file_path: Path of the delimited text file to read.
        sep: Column separator used in the file.
        col_index: Index of the column whose entropy is computed.
        col_name: Label used for the column in the printed message.

    Returns:
        The entropy in bits as a float; 0.0 when the column has no entries.
    """
    values = np.asarray(list(read_column(file_path, sep, col_index)))
    # One pass over the data yields the count of every distinct value,
    # instead of one full-array comparison per distinct value.
    _, counts = np.unique(values, return_counts=True)
    probs = counts / float(counts.sum())
    # Sum over an array rather than a generator (np.sum on a generator is
    # deprecated). Adding 0.0 turns a possible -0.0 into 0.0 so a
    # single-valued column prints as "0.000000".
    _entropy = float(-np.sum(probs * np.log2(probs))) + 0.0
    print("Entropy for %s = %f" % (col_name, _entropy))
    return _entropy
def read_column(file_path, sep, col_index):
    """Yield the entries of one column of a delimited text file as floats.

    Args:
        file_path: Path of the text file to read.
        sep: Column separator passed to ``str.split``.
        col_index: Index of the column to extract from every row.

    Yields:
        The ``col_index``-th field of each non-empty line, converted to float.

    Raises:
        IndexError: If a row has no field at ``col_index``.
        ValueError: If a field cannot be converted to float.
    """
    with open(file_path, "r") as f:
        for line in f:
            # Strip only the line terminator: a bare strip() would also eat
            # leading/trailing separators (e.g. tabs) and shift the columns
            # of rows whose first or last field is empty.
            row = line.rstrip("\r\n")
            if not row:
                # Ignore completely empty lines such as a trailing newline.
                continue
            yield float(row.split(sep)[col_index])
# Example configuration: replace these placeholders with the real input file,
# its column separator, and the index/label of the column to analyse.
file_path = "file_path"
sep = "\t"
col_index = 0
col_name = "col_name"

# Run the calculation only when executed as a script, so importing this
# module does not try to open the placeholder path.
if __name__ == "__main__":
    entropy(file_path, sep, col_index, col_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment