Skip to content

Instantly share code, notes, and snippets.

@shagunsodhani
Created October 26, 2016 11:01
Show Gist options
  • Save shagunsodhani/9a46fe803c8087dcde686b7e37e60567 to your computer and use it in GitHub Desktop.
Save shagunsodhani/9a46fe803c8087dcde686b7e37e60567 to your computer and use it in GitHub Desktop.
Script to calculate entropy for any column in a file
# Script to calculate entropy for any column in a file.
from __future__ import print_function
import numpy as np
def entropy(file_path, sep, col_index, col_name):
    '''Compute, print and return the Shannon entropy (in bits) of one column.

    The column is read with read_column(), so every entry is treated as a
    float and entries that compare equal are counted as the same symbol.

    Args:
        file_path: path of the delimited text file to read.
        sep: separator string placed between columns on each line.
        col_index: zero-based index of the column to analyse.
        col_name: label used for the column in the printed report.

    Returns:
        The entropy as a float; 0.0 when the column has no entries.
    '''
    values = np.asarray(list(read_column(file_path, sep, col_index)))
    if values.size == 0:
        # No observations: define the entropy as 0 instead of dividing by 0.
        _entropy = 0.0
    else:
        # One pass over the data yields the count of every distinct value,
        # instead of re-scanning the whole column once per distinct value.
        _, counts = np.unique(values, return_counts=True)
        probs = counts / float(values.size)
        # Sum over an array, not a generator (np.sum on a generator is
        # deprecated). Adding 0.0 turns a possible -0.0 into 0.0.
        _entropy = float(np.sum(-probs * np.log2(probs))) + 0.0
    # Both lines are printed to keep the script's established output.
    print(_entropy)
    print("Entropy for %s = %f" % (col_name, _entropy))
    return _entropy
def read_column(file_path, sep, col_index):
    '''Lazily yield the col_index column of a delimited file as floats.

    Args:
        file_path: path of the text file to read.
        sep: separator string placed between columns on each line.
        col_index: zero-based index of the column to extract.

    Yields:
        float: the parsed entry of the requested column, one per data line.

    Raises:
        IndexError: if a data line has fewer than col_index + 1 columns.
        ValueError: if the selected field cannot be parsed as a float.
    '''
    with open(file_path, "r") as f:
        for line in f:
            # Whitespace-only lines (e.g. a trailing empty line) carry no
            # data, so they are skipped instead of failing in float("").
            if not line.strip():
                continue
            # Remove only the line terminator. A full strip() would also
            # eat leading/trailing separators such as tabs and shift the
            # columns of rows whose first or last field is empty.
            fields = line.rstrip("\r\n").split(sep)
            yield float(fields[col_index])
# Script configuration: edit these placeholders before running.
#   file_path - input file to analyse
#   sep       - separator between columns
#   col_index - zero-based index of the column to analyse
#   col_name  - label used for the column in the printed report
file_path, sep, col_index, col_name = "file_path", "\t", 0, "col_name"

# Compute and print the entropy of the configured column.
entropy(
    file_path=file_path,
    sep=sep,
    col_index=col_index,
    col_name=col_name,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment