Skip to content

Instantly share code, notes, and snippets.

@Habush
Last active April 9, 2021 17:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Habush/23d8e0b3673eb6aff40b00ddd81b52fb to your computer and use it in GitHub Desktop.
Save Habush/23d8e0b3673eb6aff40b00ddd81b52fb to your computer and use it in GitHub Desktop.
Plot the embedding of SVD components and their projections in the same subspace. Inspired by https://europepmc.org/article/pmc/pmc5054124
def plot_emb_projection(X, y=None, ker=tanimoto_v2, alpha=0.5, params=None, annotate=False):
    """
    Plot the row vectors of X and the features (columns) of X in the same
    embedded space spanned by kernel-PCA components.

    X is factorised with a thin SVD, ``X = U S V^T``. The rows are represented
    by ``P = U S^alpha`` and the columns by ``G = V S^(1 - alpha)``. Kernel PCA
    is fitted on G, P is projected with the fitted model, and the first two
    components of both are drawn on one scatter plot.

    :param X: the data matrix or dataframe. When X is a DataFrame its index and
        columns label the returned frames; otherwise positional labels are used
    :param y: optional target variable (for labelling). It is joined onto the
        row projections and must supply a column named ``"posOutcome"``; rows
        equal to 0 and rows equal to 1 are plotted as separate groups
    :param ker: the kernel to use. A callable is handed to ``kernel_func`` to
        build precomputed kernel matrices; anything else is passed straight to
        ``KernelPCA(kernel=...)``
    :param alpha: the exponent to use for matrix factorization (the share of
        the singular values assigned to the row representation)
    :param params: optional dict of extra keyword arguments for ``KernelPCA``;
        only used when ``ker`` is not callable
    :param annotate: when true, write each column label next to its point
    :return: The pca projections of the columns and of the row vectors, as the
        tuple ``(G_pca_df, P_pca_df)``
    """
    # Do SVD Decomposition
    u, s, v_t = scipy.linalg.svd(X, full_matrices=False)

    # Raise the singular values themselves to the power and scale the columns.
    # Taking an element-wise power of the diagonal *matrix* would also touch
    # the off-diagonal zeros (0**0 == 1, 0**negative == inf) and corrupt the
    # factors whenever alpha is not strictly between 0 and 1.
    P = u * np.power(s, alpha)
    G = v_t.T * np.power(s, 1 - alpha)

    if callable(ker):
        # Apply the kernel on G
        K = kernel_func(G, G, ker=ker)
        # Kernel values between the vectors in P and the vectors in G, so that
        # P can be projected with the model fitted on G
        K_p = kernel_func(P, G, ker=ker)
        # Apply PCA
        kpca = KernelPCA(kernel="precomputed")
        G_pca = kpca.fit_transform(K)
        P_pca = kpca.transform(K_p)
    else:
        # params defaults to None, which cannot be unpacked with **
        kpca = KernelPCA(kernel=ker, **(params or {}))
        G_pca = kpca.fit_transform(G)
        P_pca = kpca.transform(P)

    # plot the first two components of G_pca and P_pca on the same plot
    _fig, ax = plt.subplots(1, 1, figsize=(16, 12))
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.grid()

    # Make dataframes for plotting; only a DataFrame carries row/column labels
    has_labels = isinstance(X, pd.DataFrame)
    G_pca_df = pd.DataFrame(G_pca, index=X.columns if has_labels else None)
    P_pca_df = pd.DataFrame(P_pca, index=X.index if has_labels else None)

    if y is not None:
        P_pca_df = P_pca_df.join(y)
        pos_0_df = P_pca_df[P_pca_df["posOutcome"] == 0]
        pos_1_df = P_pca_df[P_pca_df["posOutcome"] == 1]
        ax.scatter(pos_0_df[0], pos_0_df[1], c='r', marker=6, label="0")
        ax.scatter(pos_1_df[0], pos_1_df[1], c='g', marker="+", label="1")
    else:
        ax.scatter(P_pca_df[0], P_pca_df[1], c='g', marker="+", label="Patients")

    ax.scatter(G_pca_df[0], G_pca_df[1], c='b', marker="x", label="GO/Pathway")

    if annotate:
        # Iterate labels and coordinates together: this does not rebind the
        # ``y`` parameter and stays correct when column labels repeat.
        for label, g_x, g_y in zip(G_pca_df.index, G_pca_df[0], G_pca_df[1]):
            # With textcoords="offset points" and no xytext, matplotlib would
            # use xy itself as the offset; give a small fixed offset instead.
            ax.annotate(label, xy=(g_x, g_y), xytext=(3, 3), textcoords="offset points")

    ax.axvline(x=0)
    ax.axhline(y=0)
    ax.legend()
    return G_pca_df, P_pca_df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment