rjs211/cld-py.py

## cld-py.py
import numpy as np


def sweep(mat : np.ndarray):
    """

    :param mat:  The lmat generated after absorption phase , a np ndarray matrix of bool
    :return: It returns the matrix after removing redundant entries

    This function is pretty much a copy of sweep method from the corresponding R package https://github.com/cran/multcomp/blob/master/R/cld.R
    """
    x = mat.copy()
    while True:
        locked = np.zeros(x.shape, dtype=np.bool) # True indicates that another letter dependes on this entry
        flag = False
        for i in range(x.shape[1]): # for each column
            tmp = np.zeros(x.shape, dtype=np.bool)
            rows_to_consider = np.where(x[:, i])[0] # get items of those rows which are TRUE in col "i"
            tmp[rows_to_consider] = x[rows_to_consider]
            one = rows_to_consider
            flag = False
            if np.any(np.sum(tmp, axis=1) == 1): # there is at least one row "l" where mat[l,i] is the only item which is TRUE i.e. no item can be removed in this column
                continue
            for j in rows_to_consider:
                if locked[j][i]:
                    continue

                chck = 0
                lck = []
                for k in rows_to_consider:
                    if j == k:
                        continue
                    rows = tmp[[j, k]]
                    dbl = rows[0] & rows[1]  # true in more than 1 column (it is already true in column i)
                    hit = list(np.where(dbl)[0])
                    hit = [dum for dum in hit if dum != i]
                    if len(hit) > 0:
                        chck += 1
                        lck.append([(j, hit[-1]), (k, hit[-1])]) # record items which have to be locked, use last column if multiple hits
                if chck == len(rows_to_consider)-1 and chck != 0: # item is redundant
                    for li in lck:                  # lock items
                        locked[li[0][0]][li[0][1]] = True
                        locked[li[1][0]][li[1][1]] = True
                    x[j][i] = False    # delete redundant entry

            if np.all(x[:,i] == False):   # delete column where each entry is FALSE and restart
                x = np.delete(x, i, 1) # delete the i-th index on the 1-st axis ie remove ith column
                flag = True
                break
                #return sweep(x)
        if flag:
            continue
        false_columns = np.where(np.sum(x, axis=0) == 0)[0]
        if len(false_columns) > 0:
            x = np.delete(x, false_columns, axis=1)
        return x

    return x


def absorb_sweep(mat: np.ndarray):
    """

    :param mat:  Ths significant relation matrix: pairwise . mat[t1][t2] is True when t1-t2 relation is significant.
                Must be a square symmetric matrix as only the upper triangular matrix is considered.
    :return: The boolean matrix of characters, where each column corresponds a unique character and the True values in each column denotes that the corresponding rows are pairwise insignificant.
    """
    signif_mat = mat.copy()
    insig_mat = signif_mat ^ True # inversion
    num_rows = signif_mat.shape[0]
    lmat = np.ones((num_rows,1), dtype= np.bool)
    for i in range(num_rows):
        for j in range(i,num_rows):
            if not signif_mat[i][j]:
                continue
            wrong_cols = np.where(lmat[i] & lmat[j])[0] # true if an alphabet occurs in both i and j but it is significant so shouldnt have occred
            correct_cols = [dum for dum in range(lmat.shape[1]) if dum not in wrong_cols ]
            if len(wrong_cols) == 0:
                continue
            lmat_cor = lmat[:,correct_cols]
            lmat_wrong = lmat[:, wrong_cols]
            for k in range(lmat_wrong.shape[1]):
                new_cols = np.zeros((num_rows,2), dtype=np.bool)
                new_cols |= lmat_wrong[:, k][:, None]
                new_cols[i][0] = False
                new_cols[j][1] = False
                if len(correct_cols) > 0:
                    new_col_redundant = np.asarray([np.any(np.all(lmat_cor | (~new_cols[:, l][:, None]), axis=0)) for l in range(2)], dtype=np.bool)
                    new_cols = new_cols[:, np.where(~new_col_redundant)[0]]

                lmat_cor = np.concatenate((lmat_cor, new_cols), axis = 1)
            lmat = lmat_cor
    lmat = lmat[:,np.argsort(lmat.sum(axis=0))]
    lmat = sweep(lmat)
    lmat = lmat[:, np.argsort(np.asarray([np.where(lmat[:, l])[0][0] for l in range(lmat.shape[1])]))]
    lmat = lmat[:, np.argsort(lmat.sum(axis=0))]
    lmat = sweep(lmat)
    lmat = lmat[:, np.flip(np.argsort(np.asarray([np.where(lmat[:, l])[0][0] for l in range(lmat.shape[1])])))]
    return lmat

def get_letters(mat):
    '''

    :param mat: the output of absorb-sweep method
    :return: a list of strings where each string is the representation of the corresponding row.
    '''
    lmat = np.chararray(mat.shape)
    lmat[:] = ' '
    col_names = [chr(c) for c in range(97, 123)] + [chr(c) for c in range(65,91)] + [str(c)[0] for c in range(10)]
    if mat.shape[1] > len(col_names):
        print('Too Many Columns')
        return None
    for i in range(mat.shape[1]):
        lmat[np.where(mat[:, i])[0], i] = col_names[i]
    lmat = [j.tostring().decode('utf-8') for j in lmat]
    return lmat


def get_cld_from_df(df, alpha=0.05):
    '''

    :param df:
    :param alpha:
    :return: if characters are common between two rows it means that they have a significant similarity
    '''

    p_values = df.values
    # in the paper, if relation is significant, then there is no common character.
    # we want no common character if they are dissimilar ie if the p values are < threshold, they are dissimilar and hence significant

    signif_mat = p_values < alpha
    np.fill_diagonal(signif_mat, False) # row is not different from itself.
    return absorb_sweep(signif_mat)


if __name__ == '__main__':
    a = np.zeros((8, 8), dtype=np.bool)
    b = [(0, 6), (0, 7), (1, 3), (1, 4), (2, 4)]

    for m, n in b:
        a[m][n] = a[n][m] = True

    m = absorb_sweep(a)
    # print()
    print("\n".join(get_letters(m)))
	import numpy as np


	def sweep(mat : np.ndarray):
	"""

	:param mat: The lmat generated after absorption phase , a np ndarray matrix of bool
	:return: It returns the matrix after removing redundant entries

	This function is pretty much a copy of sweep method from the corresponding R package https://github.com/cran/multcomp/blob/master/R/cld.R
	"""
	x = mat.copy()
	while True:
	locked = np.zeros(x.shape, dtype=np.bool) # True indicates that another letter dependes on this entry
	flag = False
	for i in range(x.shape[1]): # for each column
	tmp = np.zeros(x.shape, dtype=np.bool)
	rows_to_consider = np.where(x[:, i])[0] # get items of those rows which are TRUE in col "i"
	tmp[rows_to_consider] = x[rows_to_consider]
	one = rows_to_consider
	flag = False
	if np.any(np.sum(tmp, axis=1) == 1): # there is at least one row "l" where mat[l,i] is the only item which is TRUE i.e. no item can be removed in this column
	continue
	for j in rows_to_consider:
	if locked[j][i]:
	continue

	chck = 0
	lck = []
	for k in rows_to_consider:
	if j == k:
	continue
	rows = tmp[[j, k]]
	dbl = rows[0] & rows[1] # true in more than 1 column (it is already true in column i)
	hit = list(np.where(dbl)[0])
	hit = [dum for dum in hit if dum != i]
	if len(hit) > 0:
	chck += 1
	lck.append([(j, hit[-1]), (k, hit[-1])]) # record items which have to be locked, use last column if multiple hits
	if chck == len(rows_to_consider)-1 and chck != 0: # item is redundant
	for li in lck: # lock items
	locked[li[0][0]][li[0][1]] = True
	locked[li[1][0]][li[1][1]] = True
	x[j][i] = False # delete redundant entry

	if np.all(x[:,i] == False): # delete column where each entry is FALSE and restart
	x = np.delete(x, i, 1) # delete the i-th index on the 1-st axis ie remove ith column
	flag = True
	break
	#return sweep(x)
	if flag:
	continue
	false_columns = np.where(np.sum(x, axis=0) == 0)[0]
	if len(false_columns) > 0:
	x = np.delete(x, false_columns, axis=1)
	return x

	return x


	def absorb_sweep(mat: np.ndarray):
	"""

	:param mat: Ths significant relation matrix: pairwise . mat[t1][t2] is True when t1-t2 relation is significant.
	Must be a square symmetric matrix as only the upper triangular matrix is considered.
	:return: The boolean matrix of characters, where each column corresponds a unique character and the True values in each column denotes that the corresponding rows are pairwise insignificant.
	"""
	signif_mat = mat.copy()
	insig_mat = signif_mat ^ True # inversion
	num_rows = signif_mat.shape[0]
	lmat = np.ones((num_rows,1), dtype= np.bool)
	for i in range(num_rows):
	for j in range(i,num_rows):
	if not signif_mat[i][j]:
	continue
	wrong_cols = np.where(lmat[i] & lmat[j])[0] # true if an alphabet occurs in both i and j but it is significant so shouldnt have occred
	correct_cols = [dum for dum in range(lmat.shape[1]) if dum not in wrong_cols ]
	if len(wrong_cols) == 0:
	continue
	lmat_cor = lmat[:,correct_cols]
	lmat_wrong = lmat[:, wrong_cols]
	for k in range(lmat_wrong.shape[1]):
	new_cols = np.zeros((num_rows,2), dtype=np.bool)
	new_cols \|= lmat_wrong[:, k][:, None]
	new_cols[i][0] = False
	new_cols[j][1] = False
	if len(correct_cols) > 0:
	new_col_redundant = np.asarray([np.any(np.all(lmat_cor \| (~new_cols[:, l][:, None]), axis=0)) for l in range(2)], dtype=np.bool)
	new_cols = new_cols[:, np.where(~new_col_redundant)[0]]

	lmat_cor = np.concatenate((lmat_cor, new_cols), axis = 1)
	lmat = lmat_cor
	lmat = lmat[:,np.argsort(lmat.sum(axis=0))]
	lmat = sweep(lmat)
	lmat = lmat[:, np.argsort(np.asarray([np.where(lmat[:, l])[0][0] for l in range(lmat.shape[1])]))]
	lmat = lmat[:, np.argsort(lmat.sum(axis=0))]
	lmat = sweep(lmat)
	lmat = lmat[:, np.flip(np.argsort(np.asarray([np.where(lmat[:, l])[0][0] for l in range(lmat.shape[1])])))]
	return lmat

	def get_letters(mat):
	'''

	:param mat: the output of absorb-sweep method
	:return: a list of strings where each string is the representation of the corresponding row.
	'''
	lmat = np.chararray(mat.shape)
	lmat[:] = ' '
	col_names = [chr(c) for c in range(97, 123)] + [chr(c) for c in range(65,91)] + [str(c)[0] for c in range(10)]
	if mat.shape[1] > len(col_names):
	print('Too Many Columns')
	return None
	for i in range(mat.shape[1]):
	lmat[np.where(mat[:, i])[0], i] = col_names[i]
	lmat = [j.tostring().decode('utf-8') for j in lmat]
	return lmat


	def get_cld_from_df(df, alpha=0.05):
	'''

	:param df:
	:param alpha:
	:return: if characters are common between two rows it means that they have a significant similarity
	'''

	p_values = df.values
	# in the paper, if relation is significant, then there is no common character.
	# we want no common character if they are dissimilar ie if the p values are < threshold, they are dissimilar and hence significant

	signif_mat = p_values < alpha
	np.fill_diagonal(signif_mat, False) # row is not different from itself.
	return absorb_sweep(signif_mat)


	if __name__ == '__main__':
	a = np.zeros((8, 8), dtype=np.bool)
	b = [(0, 6), (0, 7), (1, 3), (1, 4), (2, 4)]

	for m, n in b:
	a[m][n] = a[n][m] = True

	m = absorb_sweep(a)
	# print()
	print("\n".join(get_letters(m)))