'''
Non-parametric computation of entropy and mutual information.

Adapted by G Varoquaux from code created by R Brette, itself
from several papers (see references in the code).

This code is maintained at https://github.com/mutualinfo/mutual_info
Please download the latest code there, to have improvements and
bug fixes.

These computations rely on nearest-neighbor statistics.
'''
import numpy as np
from numpy import pi
from scipy import ndimage
from scipy.linalg import det
from scipy.special import gamma, psi
from sklearn.neighbors import NearestNeighbors

__all__ = ['entropy', 'mutual_information', 'entropy_gaussian']

EPS = np.finfo(float).eps

def nearest_distances(X, k=1):
    '''
    X = array(N, M)
    N = number of points
    M = number of dimensions

    Returns the distance to the kth nearest neighbor for every point in X.
    '''
    knn = NearestNeighbors(n_neighbors=k + 1)
    knn.fit(X)
    d, _ = knn.kneighbors(X)  # the first nearest neighbor is the point itself
    return d[:, -1]  # distance to the kth true nearest neighbor

def entropy_gaussian(C):
    '''
    Entropy of a Gaussian variable with covariance matrix C.
    '''
    if np.isscalar(C):  # C is the variance
        return .5 * (1 + np.log(2 * pi)) + .5 * np.log(C)
    else:
        n = C.shape[0]  # dimension
        return .5 * n * (1 + np.log(2 * pi)) + .5 * np.log(abs(det(C)))

def entropy(X, k=1):
    ''' Returns the entropy of X.

    Parameters
    ==========
    X : array-like, shape (n_samples, n_features)
        The data whose entropy is computed
    k : int, optional
        number of nearest neighbors for density estimation

    Notes
    =====
    Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of the entropy
    of a random vector. Probl. Inf. Transm. 23, 95-101.
    See also: Evans, D. 2008 A computationally efficient estimator for
    mutual information, Proc. R. Soc. A 464 (2093), 1203-1215.
    and:
    Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual
    information. Phys Rev E 69(6 Pt 2):066138.
    '''
    # Distance to the kth nearest neighbor (Euclidean, not squared)
    r = nearest_distances(X, k)
    n, d = X.shape
    volume_unit_ball = (pi ** (.5 * d)) / gamma(.5 * d + 1)
    # An alternative formulation, from
    # F. Perez-Cruz, (2008). Estimation of Information Theoretic Measures
    # for Continuous Random Variables. Advances in Neural Information
    # Processing Systems 21 (NIPS). Vancouver (Canada), December:
    #     return d*mean(log(r)) + log(volume_unit_ball) + log(n-1) - log(k)
    return (d * np.mean(np.log(r + np.finfo(X.dtype).eps))
            + np.log(volume_unit_ball) + psi(n) - psi(k))

def mutual_information(variables, k=1):
    '''
    Returns the mutual information between any number of variables.
    Each variable is a matrix X = array(n_samples, n_features)
    where
        n_samples = number of samples
        n_features = number of dimensions

    Optionally, the following keyword argument can be specified:
        k = number of nearest neighbors for density estimation

    Example: mutual_information((X, Y)), mutual_information((X, Y, Z), k=5)
    '''
    if len(variables) < 2:
        raise AttributeError(
            "Mutual information must involve at least 2 variables")
    all_vars = np.hstack(variables)
    return (sum([entropy(X, k=k) for X in variables])
            - entropy(all_vars, k=k))

def mutual_information_2d(x, y, sigma=1, normalized=False):
    """
    Computes (normalized) mutual information between two 1D variables from
    their joint histogram.

    Parameters
    ----------
    x : 1D array
        first variable
    y : 1D array
        second variable
    sigma : float
        sigma for Gaussian smoothing of the joint histogram
    normalized : bool
        if True, return the normalized mutual information of
        Studholme et al. (1998) instead of the plain MI

    Returns
    -------
    nmi : float
        the computed similarity measure
    """
    bins = (256, 256)

    jh = np.histogram2d(x, y, bins=bins)[0]

    # smooth the joint histogram with a Gaussian filter of given sigma
    ndimage.gaussian_filter(jh, sigma=sigma, mode='constant',
                            output=jh)

    # compute marginal histograms
    jh = jh + EPS
    sh = np.sum(jh)
    jh = jh / sh
    s1 = np.sum(jh, axis=0).reshape((-1, jh.shape[0]))
    s2 = np.sum(jh, axis=1).reshape((jh.shape[1], -1))

    # Normalised Mutual Information of:
    # Studholme, Hill & Hawkes (1998).
    # "A normalized entropy measure of 3-D medical image alignment".
    # in Proc. Medical Imaging 1998, vol. 3338, San Diego, CA, pp. 132-143.
    if normalized:
        mi = ((np.sum(s1 * np.log(s1)) + np.sum(s2 * np.log(s2)))
              / np.sum(jh * np.log(jh))) - 1
    else:
        mi = (np.sum(jh * np.log(jh)) - np.sum(s1 * np.log(s1))
              - np.sum(s2 * np.log(s2)))

    return mi

###############################################################################
# Tests

def test_entropy():
    # Testing against correlated Gaussian variables
    # (analytical results are known)
    # Entropy of a 3-dimensional Gaussian variable
    rng = np.random.RandomState(0)
    n = 50000
    d = 3
    P = np.array([[1, 0, 0], [0, 1, .5], [0, 0, 1]])
    C = np.dot(P, P.T)
    Y = rng.randn(d, n)
    X = np.dot(P, Y)
    H_th = entropy_gaussian(C)
    H_est = entropy(X.T, k=5)
    # Our estimated entropy should always be less than the actual one
    # (entropy estimation undershoots) but not by too much
    np.testing.assert_array_less(H_est, H_th)
    np.testing.assert_array_less(.9 * H_th, H_est)

def test_mutual_information():
    # Mutual information between two correlated Gaussian variables
    # Entropy of a 2-dimensional Gaussian variable
    n = 50000
    rng = np.random.RandomState(0)
    # P = np.random.randn(2, 2)
    P = np.array([[1, 0], [0.5, 1]])
    C = np.dot(P, P.T)
    U = rng.randn(2, n)
    Z = np.dot(P, U).T
    X = Z[:, 0]
    X = X.reshape(len(X), 1)
    Y = Z[:, 1]
    Y = Y.reshape(len(Y), 1)
    # Theoretical and estimated MI (in nats, since natural logs are used)
    MI_est = mutual_information((X, Y), k=5)
    MI_th = (entropy_gaussian(C[0, 0])
             + entropy_gaussian(C[1, 1])
             - entropy_gaussian(C)
             )
    # Our estimator should undershoot once again: it will undershoot more
    # for the 2D estimation than for the 1D estimation
    print((MI_est, MI_th))
    np.testing.assert_array_less(MI_est, MI_th)
    np.testing.assert_array_less(MI_th, MI_est + .3)

def test_degenerate():
    # Test that our estimators are well-behaved with regard to
    # degenerate solutions
    rng = np.random.RandomState(0)
    x = rng.randn(50000)
    X = np.c_[x, x]
    assert np.isfinite(entropy(X))
    assert np.isfinite(mutual_information((x[:, np.newaxis],
                                           x[:, np.newaxis])))
    assert 2.9 < mutual_information_2d(x, x) < 3.1

def test_mutual_information_2d():
    # Mutual information between two correlated Gaussian variables
    # Entropy of a 2-dimensional Gaussian variable
    n = 50000
    rng = np.random.RandomState(0)
    # P = np.random.randn(2, 2)
    P = np.array([[1, 0], [.9, .1]])
    C = np.dot(P, P.T)
    U = rng.randn(2, n)
    Z = np.dot(P, U).T
    X = Z[:, 0]
    X = X.reshape(len(X), 1)
    Y = Z[:, 1]
    Y = Y.reshape(len(Y), 1)
    # Theoretical and estimated MI (in nats)
    MI_est = mutual_information_2d(X.ravel(), Y.ravel())
    MI_th = (entropy_gaussian(C[0, 0])
             + entropy_gaussian(C[1, 1])
             - entropy_gaussian(C)
             )
    print((MI_est, MI_th))
    # Our estimator should undershoot once again: it will undershoot more
    # for the 2D estimation than for the 1D estimation
    np.testing.assert_array_less(MI_est, MI_th)
    np.testing.assert_array_less(MI_th, MI_est + .2)

if __name__ == '__main__':
    # Run our tests
    test_entropy()
    test_mutual_information()
    test_degenerate()
    test_mutual_information_2d()
Thank you for your work. Just one quick comment about the nearest_distances function. --g
I used GaelVaroquaux's code above with the Boston housing data, computing the mutual information between each feature and the target, see here.
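A minimal sketch of that kind of per-feature scan, using the gist's mutual_information on synthetic stand-in data (array names, sizes and k are illustrative):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(500, 5)                  # stand-in feature matrix (n_samples, n_features)
y = X[:, 0] + 0.1 * rng.randn(500)     # target driven mostly by feature 0

scores = [mutual_information((X[:, [j]], y[:, np.newaxis]), k=3)
          for j in range(X.shape[1])]
ranking = np.argsort(scores)[::-1]     # feature indices, highest estimated MI first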
I noticed two people mentioned computing negative mutual information. I haven't taken a close look at this implementation or the papers referenced, but "A Study on Mutual Information-based Feature Selection for Text Categorization" (https://core.ac.uk/download/pdf/11310215.pdf) addresses a potential issue.
Reading the comments, using this function seems a bit worrying: negative MI?
If I use
Estimates of mutual information can come out negative due to sampling errors and to violations of the assumption behind the neighbor-based density estimate, namely that the sampling rate is high enough for the point density to be locally uniform around each point. One approach to overcome this is to bootstrap-estimate the bias: estimate the mutual information at several sample sizes and extrapolate to an infinite sample size.
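A rough sketch of that extrapolation idea, assuming the gist's mutual_information is in scope (the subsample fractions, the number of rounds and the linear fit in 1/n are illustrative choices, not prescribed by the comment):

import numpy as np

def extrapolated_mi(x, y, k=3, fractions=(0.25, 0.5, 1.0), n_rounds=5, seed=0):
    # x, y: 2-D arrays of shape (n_samples, n_features).
    # Estimate MI on random subsamples of several sizes, then extrapolate
    # the trend in 1/n to 1/n -> 0, i.e. an "infinite" sample size.
    rng = np.random.RandomState(seed)
    n = x.shape[0]
    inv_sizes, estimates = [], []
    for f in fractions:
        m = int(f * n)
        for _ in range(n_rounds):
            idx = rng.choice(n, size=m, replace=False)
            inv_sizes.append(1.0 / m)
            estimates.append(mutual_information((x[idx], y[idx]), k=k))
    slope, intercept = np.polyfit(inv_sizes, estimates, 1)
    return intercept  # extrapolated estimate at 1/n = 0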
What if I want the MI of two sets that don't have the same number of samples? Like X.shape = (30, 10) and Y.shape = (1, 10). My impression is that the np.hstack in line 103 is not enough. Cool code btw.
I agree. According to the original paper (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.422.5607&rep=rep1&type=pdf), this should calculate the Euclidean distance to the k-th nearest neighbor of xi in X \ {xi}.
I don't really see how it would be possible to compute non-parametric mutual information between two sets where there is no correspondence between the samples. By construction, estimating MI requires this correspondence.
I agree that using n_neighbors=k gives the (k-1)-th nearest neighbor, as the first neighbor returned is the point itself. The code was wrong; maybe that error was introduced by an evolution of scikit-learn. I've fixed it.
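A quick way to see the off-by-one, independent of the gist code:

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.arange(5, dtype=float).reshape(-1, 1)   # points 0, 1, 2, 3, 4
d, _ = NearestNeighbors(n_neighbors=2).fit(X).kneighbors(X)
print(d[:, 0])   # all zeros: the closest "neighbor" of each query point is itself
print(d[:, 1])   # distances to the true 1st nearest neighbor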
Hello friends, thanks.
I've noticed that scaling the samples changes the output. I suspect this is a small bug in the scaling somewhere; will post back if I trace it down. Compare the following:
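A sketch of the kind of comparison in question, using the gist's entropy and mutual_information (the data and scale factors here are arbitrary):

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(2000, 1)
y = x + 0.5 * rng.randn(2000, 1)

for scale in (1.0, 10.0, 1000.0):
    h = entropy(scale * x, k=3)                   # true differential entropy shifts by log(scale)
    mi = mutual_information((scale * x, y), k=3)  # true MI is invariant to rescaling either variable
    print(scale, h, mi)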
UPDATE: I think this is just the result of using differential entropy throughout. Probably (not sure yet), but using some copula-style normalization (see pd.rank, for example) is a decent way of getting something invariant to some transformations; it might destroy some structure depending on the use case, though. Also, there are a few forks kicking around now. I think I've got one that fixed a small issue with the case of mutual information with itself: https://gist.github.com/cottrell/770b811c03c846386b19e9c4bd18772f/revisions I'm wondering if we should try to corral everyone together and make a repo. Viewing diffs is a little less convenient from the gist, though I like the single discussion thread.
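A sketch of that copula-style preprocessing, here with scipy.stats.rankdata rather than pandas (an illustrative substitution):

import numpy as np
from scipy.stats import rankdata

def to_copula(X):
    # Map each column to its empirical CDF values (ranks rescaled to (0, 1)),
    # which makes downstream estimates invariant to monotone per-column transforms.
    X = np.asarray(X, dtype=float)
    n = X.shape[0]
    return np.column_stack([rankdata(col) / (n + 1.0) for col in X.T])

# e.g. mutual_information((to_copula(x), to_copula(y)), k=3) with the gist's estimator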
I cannot comment on your potential bug (I'd have to dig back in this code, and it's been many years). However, I agree that this code should be consolidated somewhere. Maybe it has been integrated in the library (I haven't followed). If not, if there is something simple that I can do to help (such as pointing to a repository), I would be more than happy. Cheers!
Thank you for the work. Assume that I have a node with some features (f1, f2, f3), and another node with features (f1', f2', f3'). How can I find the mutual information between the two?
Good question.
I would assume the natural result would be -6*(1/6)*log(1/6) = log(6) = 1.79175946923.
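That value is the entropy, in nats, of a uniform distribution over six outcomes; a one-line check:

import numpy as np
print(-6 * (1.0 / 6) * np.log(1.0 / 6), np.log(6))   # both ~1.791759469228055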
I noted two errors in this code (based on the reference paper from Kraskov), and fixing them also removed the negative MI values I was getting:
volume_unit_ball = 1
@thismartian I haven't yet found it in another library where it should probably sit. Have created this and added you and @GaelVaroquaux. You should have admin privileges if I did it right.
Creating a library for collective maintenance was really the right move. I'll update the gist to point there.
I've copy-pasta'd your change into this branch: https://github.com/mutualinfo/mutual_info/tree/thismartian There is some test failure but I haven't looked at it yet. I suspect if you still have it in your head it is an easy test change. I'm not even sure what is correct since it's been a few months since I've used this stuff.
Hi cottrell, thanks a lot for this. I figured out what the problem is: the distance used to calculate the entropy should be 2x the distance to the nearest neighbor. Not sure I'm doing it right, but I don't seem to have permission to make changes to the file; perhaps you could try this in the entropy function: return d * np.mean(np.log(2*r + np.finfo(X.dtype).eps)) + np.log(volume_unit_ball) + psi(n) - psi(k)
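For reference, a sketch of the convention in Kraskov et al. (2004), eq. (20), which the two proposed fixes (the factor of 2 and volume_unit_ball = 1) appear to correspond to when the maximum norm is used; the function name is illustrative and this is not the repository's code:

import numpy as np
from scipy.special import psi
from sklearn.neighbors import NearestNeighbors

def entropy_kl_maxnorm(X, k=3):
    # Kozachenko-Leonenko estimator in the Kraskov eq. (20) form:
    #   H ~ -psi(k) + psi(n) + log(c_d) + (d/n) * sum_i log(eps_i)
    # With the max (Chebyshev) norm, c_d = 1 and eps_i is twice the
    # distance from point i to its k-th nearest neighbor.
    n, d = X.shape
    nn = NearestNeighbors(n_neighbors=k + 1, metric='chebyshev').fit(X)
    dist, _ = nn.kneighbors(X)   # column 0 is the point itself
    eps = 2.0 * dist[:, -1]
    return -psi(k) + psi(n) + d * np.mean(np.log(eps + np.finfo(X.dtype).eps))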
Hmmm, I can paste it in and try, but let's try to fix the permissions. Having a look now ...
Ok, I think I've now added @thismartian and @GaelVaroquaux as admins on the org and the repo. For some reason it only set you up as members initially when I punched them in. Let me know if you can't access it.
@thismartian The tests now pass after I pasted the change in. It's still just on that branch if you want to review it. If there is an obvious test that would have failed before your change and passed after, maybe we can add that?
@cottrell the changes look fine. I think the entropy test was failing (as expected). Is that what you mean? Sorry, it's not clear to me...
No, I mean that before the change no tests were failing, but you noticed something was wrong. What missing test would have shown that something was wrong?
Ah, for me the main indication was just that for some random combinations the mutual info was negative (as you had pointed out earlier too). I think in the test itself, if you change the covariance matrix to unit variance, it shows that the test failed. Honestly, I am not sure I understand how the covariance matrix is generated in the tests (I need to read about it). What I would do is: define a correlation coefficient (r) and a standard deviation (s), and based on these two it's possible to build a unit-variance, zero-mean covariance matrix:
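A sketch of that setup (values are illustrative): for zero-mean, unit-variance Gaussians with correlation coefficient r, the covariance matrix is [[1, r], [r, 1]] and the true mutual information has the closed form -0.5*log(1 - r**2), which gives a direct target for a test:

import numpy as np

r = 0.6                                # chosen correlation coefficient
C = np.array([[1.0, r], [r, 1.0]])     # zero-mean, unit-variance covariance matrix
rng = np.random.RandomState(0)
Z = rng.multivariate_normal([0.0, 0.0], C, size=50000)
X, Y = Z[:, [0]], Z[:, [1]]

MI_true = -0.5 * np.log(1.0 - r ** 2)  # analytic MI for the bivariate Gaussian
# MI_est = mutual_information((X, Y), k=5)   # the gist's estimator, for comparison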
I was probably not clear, let me try to rephrase:
Thanks @thismartian, I will take a look and try to jam a test in shortly.
Hi @cottrell, did you have time to test the changes from the thismartian branch? I am pretty sure they are correct; I tested them also against scikit-learn's mutual info. I have not looked into mutual_info_2d at all though. Will try to find some time soon.
@thismartian Yeah, I can't remember where I left it. I think I put some new tests in and have a branch there. I think I was looking at differential entropy in general and how it sort of has a dimensionality issue. The branch should run and we could just merge. Likely I'll dig into it more next time I am looking at entropy of continuous random variables.
Hi Gael,
Thanks a lot for these functions! When I run the test_mutual_information() I get:
-0.136308887598 and 0.111571775657
I get it that the estimator should undershoot, but can MI be negative? I'm trying to use this function to implement the Joint Mutual Information feature selection method:
Data Visualization and Feature Selection: New Algorithms for Nongaussian Data. H. Yang and J. Moody, NIPS (1999).
This method performed best out of many information-theoretic filter methods:
Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection. G. Brown, A. Pocock, M.-J. Zhao, M. Lujan. Journal of Machine Learning Research, 13:27-66 (2012).
C Implementation: https://github.com/Craigacp/FEAST/blob/master/JMI.c
When I'm trying to estimate the joint mutual information of two features with y, so I(X1, X2; Y), where X1 and X2 are two columns of X and Y is the response variable, I get a positive value only for k=1, but as soon as I increase the size of the neighborhood the MI goes negative. I'm only using the MI measure to rank features relative to each other, so if this method can be trusted, I don't really care if it's negative. What do you think?
Thanks a lot,
Daniel
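For what it's worth, one way to form the joint term I(X1, X2; Y) with the estimator in this gist is to stack the two features into a single two-dimensional variable. A sketch with synthetic stand-in data (column indices and k are illustrative, and the caveats about negative estimates discussed above still apply):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(2000, 4)                       # stand-in design matrix
y = X[:, 0] + X[:, 1] + 0.1 * rng.randn(2000)

X12 = X[:, [0, 1]]                           # the two candidate features as one 2-D variable
jmi = mutual_information((X12, y[:, np.newaxis]), k=3)   # estimate of I(X1, X2; Y)
print(jmi)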