Last active
March 5, 2020 08:12
-
-
Save rjs211/4c128a593a8184f97c847196ea8a53ee to your computer and use it in GitHub Desktop.
Compact Letter display (CLD) of all pairwise comparisons (similar to cld.r)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def sweep(mat : np.ndarray): | |
""" | |
:param mat: The lmat generated after absorption phase , a np ndarray matrix of bool | |
:return: It returns the matrix after removing redundant entries | |
This function is pretty much a copy of sweep method from the corresponding R package https://github.com/cran/multcomp/blob/master/R/cld.R | |
""" | |
x = mat.copy() | |
while True: | |
locked = np.zeros(x.shape, dtype=np.bool) # True indicates that another letter dependes on this entry | |
flag = False | |
for i in range(x.shape[1]): # for each column | |
tmp = np.zeros(x.shape, dtype=np.bool) | |
rows_to_consider = np.where(x[:, i])[0] # get items of those rows which are TRUE in col "i" | |
tmp[rows_to_consider] = x[rows_to_consider] | |
one = rows_to_consider | |
flag = False | |
if np.any(np.sum(tmp, axis=1) == 1): # there is at least one row "l" where mat[l,i] is the only item which is TRUE i.e. no item can be removed in this column | |
continue | |
for j in rows_to_consider: | |
if locked[j][i]: | |
continue | |
chck = 0 | |
lck = [] | |
for k in rows_to_consider: | |
if j == k: | |
continue | |
rows = tmp[[j, k]] | |
dbl = rows[0] & rows[1] # true in more than 1 column (it is already true in column i) | |
hit = list(np.where(dbl)[0]) | |
hit = [dum for dum in hit if dum != i] | |
if len(hit) > 0: | |
chck += 1 | |
lck.append([(j, hit[-1]), (k, hit[-1])]) # record items which have to be locked, use last column if multiple hits | |
if chck == len(rows_to_consider)-1 and chck != 0: # item is redundant | |
for li in lck: # lock items | |
locked[li[0][0]][li[0][1]] = True | |
locked[li[1][0]][li[1][1]] = True | |
x[j][i] = False # delete redundant entry | |
if np.all(x[:,i] == False): # delete column where each entry is FALSE and restart | |
x = np.delete(x, i, 1) # delete the i-th index on the 1-st axis ie remove ith column | |
flag = True | |
break | |
#return sweep(x) | |
if flag: | |
continue | |
false_columns = np.where(np.sum(x, axis=0) == 0)[0] | |
if len(false_columns) > 0: | |
x = np.delete(x, false_columns, axis=1) | |
return x | |
return x | |
def absorb_sweep(mat: np.ndarray): | |
""" | |
:param mat: Ths significant relation matrix: pairwise . mat[t1][t2] is True when t1-t2 relation is significant. | |
Must be a square symmetric matrix as only the upper triangular matrix is considered. | |
:return: The boolean matrix of characters, where each column corresponds a unique character and the True values in each column denotes that the corresponding rows are pairwise insignificant. | |
""" | |
signif_mat = mat.copy() | |
insig_mat = signif_mat ^ True # inversion | |
num_rows = signif_mat.shape[0] | |
lmat = np.ones((num_rows,1), dtype= np.bool) | |
for i in range(num_rows): | |
for j in range(i,num_rows): | |
if not signif_mat[i][j]: | |
continue | |
wrong_cols = np.where(lmat[i] & lmat[j])[0] # true if an alphabet occurs in both i and j but it is significant so shouldnt have occred | |
correct_cols = [dum for dum in range(lmat.shape[1]) if dum not in wrong_cols ] | |
if len(wrong_cols) == 0: | |
continue | |
lmat_cor = lmat[:,correct_cols] | |
lmat_wrong = lmat[:, wrong_cols] | |
for k in range(lmat_wrong.shape[1]): | |
new_cols = np.zeros((num_rows,2), dtype=np.bool) | |
new_cols |= lmat_wrong[:, k][:, None] | |
new_cols[i][0] = False | |
new_cols[j][1] = False | |
if len(correct_cols) > 0: | |
new_col_redundant = np.asarray([np.any(np.all(lmat_cor | (~new_cols[:, l][:, None]), axis=0)) for l in range(2)], dtype=np.bool) | |
new_cols = new_cols[:, np.where(~new_col_redundant)[0]] | |
lmat_cor = np.concatenate((lmat_cor, new_cols), axis = 1) | |
lmat = lmat_cor | |
lmat = lmat[:,np.argsort(lmat.sum(axis=0))] | |
lmat = sweep(lmat) | |
lmat = lmat[:, np.argsort(np.asarray([np.where(lmat[:, l])[0][0] for l in range(lmat.shape[1])]))] | |
lmat = lmat[:, np.argsort(lmat.sum(axis=0))] | |
lmat = sweep(lmat) | |
lmat = lmat[:, np.flip(np.argsort(np.asarray([np.where(lmat[:, l])[0][0] for l in range(lmat.shape[1])])))] | |
return lmat | |
def get_letters(mat): | |
''' | |
:param mat: the output of absorb-sweep method | |
:return: a list of strings where each string is the representation of the corresponding row. | |
''' | |
lmat = np.chararray(mat.shape) | |
lmat[:] = ' ' | |
col_names = [chr(c) for c in range(97, 123)] + [chr(c) for c in range(65,91)] + [str(c)[0] for c in range(10)] | |
if mat.shape[1] > len(col_names): | |
print('Too Many Columns') | |
return None | |
for i in range(mat.shape[1]): | |
lmat[np.where(mat[:, i])[0], i] = col_names[i] | |
lmat = [j.tostring().decode('utf-8') for j in lmat] | |
return lmat | |
def get_cld_from_df(df, alpha=0.05): | |
''' | |
:param df: | |
:param alpha: | |
:return: if characters are common between two rows it means that they have a significant similarity | |
''' | |
p_values = df.values | |
# in the paper, if relation is significant, then there is no common character. | |
# we want no common character if they are dissimilar ie if the p values are < threshold, they are dissimilar and hence significant | |
signif_mat = p_values < alpha | |
np.fill_diagonal(signif_mat, False) # row is not different from itself. | |
return absorb_sweep(signif_mat) | |
if __name__ == '__main__': | |
a = np.zeros((8, 8), dtype=np.bool) | |
b = [(0, 6), (0, 7), (1, 3), (1, 4), (2, 4)] | |
for m, n in b: | |
a[m][n] = a[n][m] = True | |
m = absorb_sweep(a) | |
# print() | |
print("\n".join(get_letters(m))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment