Habush/plot_emb_pca_projection.py

## plot_emb_pca_projection.py
def plot_emb_projection(X, y=None, ker=tanimoto_v2, alpha=0.5, params=None, annotate=False):
    """
    Plot the row vectors of X and the features of X in one space spanned by kernel PCA components.

    X is factorised with a thin SVD, ``X = U diag(s) V^T``. The singular values are
    split between the two sides: rows are embedded as ``P = U diag(s ** alpha)`` and
    columns as ``G = V diag(s ** (1 - alpha))``. Kernel PCA is fitted on G, P is
    projected with the fitted model, and the first two components of both are drawn
    on a newly created matplotlib figure.

    :param X: the data matrix or dataframe. Anything that is not a DataFrame is wrapped
        in one, so its row/column labels become positional integers.
    :param y: the target variable (for labelling). It is joined onto the row projections
        by index and must provide a column named "posOutcome"; rows whose value is 0 or 1
        are drawn as two separate groups. When None, all rows are drawn as one group.
    :param ker: the kernel function to use. A callable is evaluated through ``kernel_func``
        and the result is given to KernelPCA as a precomputed kernel; any other value is
        passed to KernelPCA as its ``kernel`` argument.
    :param alpha: the exponent to use for matrix factorization
    :param params: optional dict of extra keyword arguments for KernelPCA; only used when
        ``ker`` is not callable. None means no extra arguments.
    :param annotate: when true, write the column label next to every column point
    :return: ``(G_pca_df, P_pca_df)`` - the projected columns (indexed by ``X.columns``) and
        the projected rows (indexed by ``X.index``, with the columns of ``y`` joined on when
        ``y`` is given)
    """
    # The docstring promises plain matrices work too, but the labels are read from
    # .index / .columns below, so give non-DataFrame input default labels.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    # Do SVD Decomposition
    u, s, v_t = scipy.linalg.svd(X, full_matrices=False)
    # Exponentiate the singular values themselves, not a dense diagonal matrix:
    # powering the off-diagonal zeros gives 1 for exponent 0 (alpha in {0, 1}) and
    # inf for negative exponents. Broadcasting scales column j by s[j] ** exponent,
    # which equals the product with the diagonal matrix.
    P = u * np.power(s, alpha)
    G = v_t.T * np.power(s, 1 - alpha)

    if callable(ker):
        # Apply the kernel on G
        K = kernel_func(G, G, ker=ker)
        # Kernel between the row embeddings and the fitted column embeddings
        K_p = kernel_func(P, G, ker=ker)
        # Apply PCA
        kpca = KernelPCA(kernel="precomputed")
        G_pca = kpca.fit_transform(K)
        P_pca = kpca.transform(K_p)
    else:
        # params defaults to None, which cannot be unpacked with **.
        kpca_kwargs = {} if params is None else dict(params)
        kpca = KernelPCA(kernel=ker, **kpca_kwargs)
        G_pca = kpca.fit_transform(G)
        P_pca = kpca.transform(P)

    # plot the first two components of G_pca and P_pca on the same plot
    _, ax = plt.subplots(1, 1, figsize=(16, 12))
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.grid()

    # Make dataframes for plotting
    G_pca_df = pd.DataFrame(G_pca, index=X.columns)
    P_pca_df = pd.DataFrame(P_pca, index=X.index)
    if y is not None:
        P_pca_df = P_pca_df.join(y)
        outcome = P_pca_df["posOutcome"]
        pos_0_df, pos_1_df = P_pca_df[outcome == 0], P_pca_df[outcome == 1]
        ax.scatter(pos_0_df[0], pos_0_df[1], c='r', marker=6, label="0")
        ax.scatter(pos_1_df[0], pos_1_df[1], c='g', marker="+", label="1")
    else:
        ax.scatter(P_pca_df[0], P_pca_df[1], c='g', marker="+", label="Patients")
    ax.scatter(G_pca_df[0], G_pca_df[1], c='b', marker="x", label="GO/Pathway")

    if annotate:
        # Iterate label/coordinate triples so the parameter ``y`` is not rebound.
        for label, g_x, g_y in zip(G_pca_df.index, G_pca_df[0], G_pca_df[1]):
            # With textcoords="offset points", xytext is an offset in points and
            # defaults to xy; give a small fixed offset so the data coordinates
            # are not reused as the offset.
            ax.annotate(str(label), xy=(g_x, g_y), xytext=(4, 4), textcoords="offset points")
    ax.axvline(x=0)
    ax.axhline(y=0)
    ax.legend()

    return G_pca_df, P_pca_df
	def plot_emb_projection(X, y=None, ker=tanimoto_v2, alpha=0.5, params=None, annotate=False):
	    """
	    Plot the row vectors of X and the features of X in one space spanned by kernel PCA components.

	    X is factorised with a thin SVD, ``X = U diag(s) V^T``. The singular values are
	    split between the two sides: rows are embedded as ``P = U diag(s ** alpha)`` and
	    columns as ``G = V diag(s ** (1 - alpha))``. Kernel PCA is fitted on G, P is
	    projected with the fitted model, and the first two components of both are drawn
	    on a newly created matplotlib figure.

	    :param X: the data matrix or dataframe. Anything that is not a DataFrame is wrapped
	        in one, so its row/column labels become positional integers.
	    :param y: the target variable (for labelling). It is joined onto the row projections
	        by index and must provide a column named "posOutcome"; rows whose value is 0 or 1
	        are drawn as two separate groups. When None, all rows are drawn as one group.
	    :param ker: the kernel function to use. A callable is evaluated through ``kernel_func``
	        and the result is given to KernelPCA as a precomputed kernel; any other value is
	        passed to KernelPCA as its ``kernel`` argument.
	    :param alpha: the exponent to use for matrix factorization
	    :param params: optional dict of extra keyword arguments for KernelPCA; only used when
	        ``ker`` is not callable. None means no extra arguments.
	    :param annotate: when true, write the column label next to every column point
	    :return: ``(G_pca_df, P_pca_df)`` - the projected columns (indexed by ``X.columns``) and
	        the projected rows (indexed by ``X.index``, with the columns of ``y`` joined on when
	        ``y`` is given)
	    """
	    # The docstring promises plain matrices work too, but the labels are read from
	    # .index / .columns below, so give non-DataFrame input default labels.
	    if not isinstance(X, pd.DataFrame):
	        X = pd.DataFrame(X)

	    # Do SVD Decomposition
	    u, s, v_t = scipy.linalg.svd(X, full_matrices=False)
	    # Exponentiate the singular values themselves, not a dense diagonal matrix:
	    # powering the off-diagonal zeros gives 1 for exponent 0 (alpha in {0, 1}) and
	    # inf for negative exponents. Broadcasting scales column j by s[j] ** exponent,
	    # which equals the product with the diagonal matrix.
	    P = u * np.power(s, alpha)
	    G = v_t.T * np.power(s, 1 - alpha)

	    if callable(ker):
	        # Apply the kernel on G
	        K = kernel_func(G, G, ker=ker)
	        # Kernel between the row embeddings and the fitted column embeddings
	        K_p = kernel_func(P, G, ker=ker)
	        # Apply PCA
	        kpca = KernelPCA(kernel="precomputed")
	        G_pca = kpca.fit_transform(K)
	        P_pca = kpca.transform(K_p)
	    else:
	        # params defaults to None, which cannot be unpacked with **.
	        kpca_kwargs = {} if params is None else dict(params)
	        kpca = KernelPCA(kernel=ker, **kpca_kwargs)
	        G_pca = kpca.fit_transform(G)
	        P_pca = kpca.transform(P)

	    # plot the first two components of G_pca and P_pca on the same plot
	    _, ax = plt.subplots(1, 1, figsize=(16, 12))
	    ax.set_xlabel("PC1")
	    ax.set_ylabel("PC2")
	    ax.grid()

	    # Make dataframes for plotting
	    G_pca_df = pd.DataFrame(G_pca, index=X.columns)
	    P_pca_df = pd.DataFrame(P_pca, index=X.index)
	    if y is not None:
	        P_pca_df = P_pca_df.join(y)
	        outcome = P_pca_df["posOutcome"]
	        pos_0_df, pos_1_df = P_pca_df[outcome == 0], P_pca_df[outcome == 1]
	        ax.scatter(pos_0_df[0], pos_0_df[1], c='r', marker=6, label="0")
	        ax.scatter(pos_1_df[0], pos_1_df[1], c='g', marker="+", label="1")
	    else:
	        ax.scatter(P_pca_df[0], P_pca_df[1], c='g', marker="+", label="Patients")
	    ax.scatter(G_pca_df[0], G_pca_df[1], c='b', marker="x", label="GO/Pathway")

	    if annotate:
	        # Iterate label/coordinate triples so the parameter ``y`` is not rebound.
	        for label, g_x, g_y in zip(G_pca_df.index, G_pca_df[0], G_pca_df[1]):
	            # With textcoords="offset points", xytext is an offset in points and
	            # defaults to xy; give a small fixed offset so the data coordinates
	            # are not reused as the offset.
	            ax.annotate(str(label), xy=(g_x, g_y), xytext=(4, 4), textcoords="offset points")
	    ax.axvline(x=0)
	    ax.axhline(y=0)
	    ax.legend()

	    return G_pca_df, P_pca_df