Skip to content

Instantly share code, notes, and snippets.

@kevhill
Last active July 8, 2017 20:55
Show Gist options
  • Save kevhill/9a38bd382847b86950f421ff4a981cc5 to your computer and use it in GitHub Desktop.
Save kevhill/9a38bd382847b86950f421ff4a981cc5 to your computer and use it in GitHub Desktop.
I ran into a barrier trying to cross correlate two matrices with missing values. I had to work our how to do that in a memory/cpu efficient way, and this is the best I could come up with.
import numpy as np
import copy
def nan_cov(X, preserve_input=True):
# we need to mutate input to fill nan's with 0's
# if we need to use the input later, we need to use a copy now
if preserve_input:
X = copy.deepcopy(X)
# note the nan values, but then fill them with 0s
nanX = np.isnan(X)
X[nanX] = 0
# masking matrix indicating where we have good data
iX = (~nanX).astype(int)
# sum of each row when filtered through valid measurements in each pair
# note that if one of the values going into the sum was nan, it is now 0 and
# will not contribute to the sum
sX = np.dot(X, iX.T)
# count of total matches
cX = np.dot(iX, iX.T)
# mean is just sum / count
uX = sX / cX
return (np.dot(X, X.T) / cX) - uX * uX.T
def nan_cor(X, preserve_input=True):
# we need to mutate input to fill nan's with 0's
# if we need to use the input later, we need to use a copy now
if preserve_input:
X = copy.deepcopy(X)
# note the nan values, but then fill them with 0s
nanX = np.isnan(X)
X[nanX] = 0
# masking matrix indicating where we have good data
iX = (~nanX).astype(int)
# sum of each row when filtered through valid measurements in each pair
# note that if one of the values going into the sum was nan, it is now 0 and
# will not contribute to the sum
sX = np.dot(X, iX.T)
# count of total matches
cX = np.dot(iX, iX.T)
# mean is just sum / count
uX = sX / cX
# compute covariance matrix
cov = ((np.dot(X, X.T) / cX) - uX * uX.T)
# Variance for each row when filtered through valid measures in the pair
vX = (np.dot(X ** 2, iX.T) / cX - uX ** 2)
# normalize xcov by the sqrt of product of the variances
return cov / (vX * vX.T) ** .5
def nan_xcov(A, B, preserve_input=True):
# we need to mutate A and B to fill nan's with 0's
# if we need to use the input later, we need to use a copy now
if preserve_input:
A = copy.deepcopy(A)
B = copy.deepcopy(B)
# note the nan values, but then fill them with 0s
nanA = np.isnan(A)
A[nanA] = 0
nanB = np.isnan(B)
B[nanB] = 0
# masking matrices indicating where we have good data
iA = (~nanA).astype(int)
iB = (~nanB).astype(int)
# this is the number of valid matches from A -> B (c.T == matches B -> A)
c = np.dot(iA, iB.T)
# compute the cross covariance between the two matrices, filtered for good values
return (np.dot(A, B.T) - (np.dot(A, iB.T) * np.dot(iA, B.T) / c)) / c
def nan_xcor(A, B, preserve_input=True):
# we need to mutate A and B to fill nan's with 0's
# if we need to use the input later, we need to use a copy now
if preserve_input:
A = copy.deepcopy(A)
B = copy.deepcopy(B)
# note the nan values, but then fill them with 0s
nanA = np.isnan(A)
A[nanA] = 0
nanB = np.isnan(B)
B[nanB] = 0
# masking matrices indicating where we have good data
iA = (~nanA).astype(int)
iB = (~nanB).astype(int)
# this is the number of valid matches from A -> B (c.T == matches B -> A)
c = np.dot(iA, iB.T)
# compute the cross covariance between the two matrices, filtered for good values
xcov = (np.dot(A, B.T) - (np.dot(A, iB.T) * np.dot(iA, B.T) / c)) / c
# Variance for each matrix when filtered through valid measures in the other
# in the case of vB we are really finding vB.t to put it in the same
# orientation as vA
vA = np.dot(A ** 2, iB.T) / c - (np.dot(A, iB.T) / c) ** 2
vB = np.dot(iA, B.T ** 2) / c - (np.dot(iA, B.T) / c) ** 2
# normalize xcov by the sqrt of product of the variances
return xcov / (vA * vB) ** .5
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment