Skip to content

Instantly share code, notes, and snippets.

@rjs211
Last active March 5, 2020 08:12
Show Gist options
  • Save rjs211/4c128a593a8184f97c847196ea8a53ee to your computer and use it in GitHub Desktop.
Save rjs211/4c128a593a8184f97c847196ea8a53ee to your computer and use it in GitHub Desktop.
Compact Letter display (CLD) of all pairwise comparisons (similar to cld.r)
import numpy as np
def sweep(mat : np.ndarray):
"""
:param mat: The lmat generated after absorption phase , a np ndarray matrix of bool
:return: It returns the matrix after removing redundant entries
This function is pretty much a copy of sweep method from the corresponding R package https://github.com/cran/multcomp/blob/master/R/cld.R
"""
x = mat.copy()
while True:
locked = np.zeros(x.shape, dtype=np.bool) # True indicates that another letter dependes on this entry
flag = False
for i in range(x.shape[1]): # for each column
tmp = np.zeros(x.shape, dtype=np.bool)
rows_to_consider = np.where(x[:, i])[0] # get items of those rows which are TRUE in col "i"
tmp[rows_to_consider] = x[rows_to_consider]
one = rows_to_consider
flag = False
if np.any(np.sum(tmp, axis=1) == 1): # there is at least one row "l" where mat[l,i] is the only item which is TRUE i.e. no item can be removed in this column
continue
for j in rows_to_consider:
if locked[j][i]:
continue
chck = 0
lck = []
for k in rows_to_consider:
if j == k:
continue
rows = tmp[[j, k]]
dbl = rows[0] & rows[1] # true in more than 1 column (it is already true in column i)
hit = list(np.where(dbl)[0])
hit = [dum for dum in hit if dum != i]
if len(hit) > 0:
chck += 1
lck.append([(j, hit[-1]), (k, hit[-1])]) # record items which have to be locked, use last column if multiple hits
if chck == len(rows_to_consider)-1 and chck != 0: # item is redundant
for li in lck: # lock items
locked[li[0][0]][li[0][1]] = True
locked[li[1][0]][li[1][1]] = True
x[j][i] = False # delete redundant entry
if np.all(x[:,i] == False): # delete column where each entry is FALSE and restart
x = np.delete(x, i, 1) # delete the i-th index on the 1-st axis ie remove ith column
flag = True
break
#return sweep(x)
if flag:
continue
false_columns = np.where(np.sum(x, axis=0) == 0)[0]
if len(false_columns) > 0:
x = np.delete(x, false_columns, axis=1)
return x
return x
def absorb_sweep(mat: np.ndarray):
"""
:param mat: Ths significant relation matrix: pairwise . mat[t1][t2] is True when t1-t2 relation is significant.
Must be a square symmetric matrix as only the upper triangular matrix is considered.
:return: The boolean matrix of characters, where each column corresponds a unique character and the True values in each column denotes that the corresponding rows are pairwise insignificant.
"""
signif_mat = mat.copy()
insig_mat = signif_mat ^ True # inversion
num_rows = signif_mat.shape[0]
lmat = np.ones((num_rows,1), dtype= np.bool)
for i in range(num_rows):
for j in range(i,num_rows):
if not signif_mat[i][j]:
continue
wrong_cols = np.where(lmat[i] & lmat[j])[0] # true if an alphabet occurs in both i and j but it is significant so shouldnt have occred
correct_cols = [dum for dum in range(lmat.shape[1]) if dum not in wrong_cols ]
if len(wrong_cols) == 0:
continue
lmat_cor = lmat[:,correct_cols]
lmat_wrong = lmat[:, wrong_cols]
for k in range(lmat_wrong.shape[1]):
new_cols = np.zeros((num_rows,2), dtype=np.bool)
new_cols |= lmat_wrong[:, k][:, None]
new_cols[i][0] = False
new_cols[j][1] = False
if len(correct_cols) > 0:
new_col_redundant = np.asarray([np.any(np.all(lmat_cor | (~new_cols[:, l][:, None]), axis=0)) for l in range(2)], dtype=np.bool)
new_cols = new_cols[:, np.where(~new_col_redundant)[0]]
lmat_cor = np.concatenate((lmat_cor, new_cols), axis = 1)
lmat = lmat_cor
lmat = lmat[:,np.argsort(lmat.sum(axis=0))]
lmat = sweep(lmat)
lmat = lmat[:, np.argsort(np.asarray([np.where(lmat[:, l])[0][0] for l in range(lmat.shape[1])]))]
lmat = lmat[:, np.argsort(lmat.sum(axis=0))]
lmat = sweep(lmat)
lmat = lmat[:, np.flip(np.argsort(np.asarray([np.where(lmat[:, l])[0][0] for l in range(lmat.shape[1])])))]
return lmat
def get_letters(mat):
'''
:param mat: the output of absorb-sweep method
:return: a list of strings where each string is the representation of the corresponding row.
'''
lmat = np.chararray(mat.shape)
lmat[:] = ' '
col_names = [chr(c) for c in range(97, 123)] + [chr(c) for c in range(65,91)] + [str(c)[0] for c in range(10)]
if mat.shape[1] > len(col_names):
print('Too Many Columns')
return None
for i in range(mat.shape[1]):
lmat[np.where(mat[:, i])[0], i] = col_names[i]
lmat = [j.tostring().decode('utf-8') for j in lmat]
return lmat
def get_cld_from_df(df, alpha=0.05):
'''
:param df:
:param alpha:
:return: if characters are common between two rows it means that they have a significant similarity
'''
p_values = df.values
# in the paper, if relation is significant, then there is no common character.
# we want no common character if they are dissimilar ie if the p values are < threshold, they are dissimilar and hence significant
signif_mat = p_values < alpha
np.fill_diagonal(signif_mat, False) # row is not different from itself.
return absorb_sweep(signif_mat)
if __name__ == '__main__':
a = np.zeros((8, 8), dtype=np.bool)
b = [(0, 6), (0, 7), (1, 3), (1, 4), (2, 4)]
for m, n in b:
a[m][n] = a[n][m] = True
m = absorb_sweep(a)
# print()
print("\n".join(get_letters(m)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment