Skip to content

Instantly share code, notes, and snippets.

@mcanouil
Last active February 2, 2024 23:31
Show Gist options
  • Save mcanouil/12da218d692d5484686c959fbcb920c2 to your computer and use it in GitHub Desktop.
Save mcanouil/12da218d692d5484686c959fbcb920c2 to your computer and use it in GitHub Desktop.
PCA plot
# # MIT License
#
# Copyright (c) 2024 Mickaël Canouil
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
library(flashpcaR)
library(ggplot2)
library(data.table)
library(ggtext)
library(glue)
library(patchwork)
n_comp <- 5
iqr_threshold <- 3
data_matrix <- rlog_counts
pca_res <- flashpca(X = t(data_matrix), stand = "none", ndim = 10)
pca_dfxy <- as.data.table(pca_res[["projection"]], keep.rownames = "ID")
setnames(
x = pca_dfxy,
old = setdiff(names(pca_dfxy), "ID"),
new = sprintf("PC%02d", as.numeric(gsub("V", "", setdiff(names(pca_dfxy), "ID"))))
)
pca_dfxy <- merge(
x = pca_dfxy[, "ID" := as.character(ID)],
y = as.data.table(phenotypes)[, "ID" := as.character(ID)],
by = "ID"
)[,
dist_centre := rowSums(sapply(.SD, function(x) as.vector(scale(x))^2)),
.SDcols = sprintf("PC%02d", 1:3)
][,
is_outlier := factor(
x = dist_centre > (
quantile(dist_centre, 0.75, na.rm = TRUE) + iqr_threshold * IQR(dist_centre, na.rm = TRUE)
),
levels = c(FALSE, TRUE),
labels = c("No", "Yes")
)
]
pca_contrib <- sprintf(
fmt = "PC%02d (%s %%)",
seq_along(pca_res[["pve"]]),
format(
x = round(pca_res[["pve"]] * 100, digits = 3),
digits = 2, nsmall = 2, trim = TRUE, scientific = FALSE
)
)
names(pca_contrib) <- sprintf("PC%02d", seq_along(pca_res[["pve"]]))
p_inertia <- ggplot(
data = data.table(
y = pca_res[["pve"]],
x = sprintf("PC%02d", seq_along(pca_res[["pve"]]))
)[1:n_comp],
mapping = aes(
x = paste0(
.data[["x"]],
"<br><i style='font-size:5pt;'>(",
format(.data[["y"]] * 100, digits = 2, nsmall = 2, trim = TRUE, scientific = FALSE),
" %)</i>"
),
y = .data[["y"]]
)
) +
geom_col(
width = 1,
colour = "white",
fill = viridis_pal(begin = 0.5, end = 0.5)(1),
na.rm = TRUE
) +
scale_y_continuous(
labels = function(x) paste(format(x * 100, digits = 2, nsmall = 2), "%"),
expand = expansion(mult = c(0, 0.05))
) +
labs(x = "Principal Component", y = "Contribution")
plot_list <- lapply(as.list(as.data.frame(combn(1:3, 2))), function(xy) {
ggplot(
data = pca_dfxy,
mapping = aes(
x = .data[[sprintf("PC%02d", xy[1])]],
y = .data[[sprintf("PC%02d", xy[2])]],
colour = is_outlier,
shape = is_outlier
)
) +
geom_hline(yintercept = 0, linetype = 2, na.rm = TRUE) +
geom_vline(xintercept = 0, linetype = 2, na.rm = TRUE) +
geom_point(na.rm = TRUE) +
stat_ellipse(type = "norm", na.rm = TRUE, show.legend = FALSE) +
scale_colour_viridis_d(
begin = 0.25,
end = 0.75,
guide = guide_legend(override.aes = list(size = 4)),
drop = FALSE
) +
scale_shape_manual(values = c(1, 4), drop = FALSE) +
geom_label_repel(
mapping = aes(label = label),
data = ~ .x[is_outlier %in% "Yes", label := ID],
fill = "white",
colour = "black",
segment.colour = "black",
min.segment.length = 0,
show.legend = FALSE,
na.rm = TRUE
) +
labs(
x = pca_contrib[sprintf("PC%02d", xy[1])],
y = pca_contrib[sprintf("PC%02d", xy[2])],
colour = "Outlier<sup>&dagger;</sup>",
shape = "Outlier<sup>&dagger;</sup>"
) +
coord_cartesian(
xlim = range(pca_dfxy[[sprintf("PC%02d", xy[1])]]),
ylim = range(pca_dfxy[[sprintf("PC%02d", xy[2])]])
)
})
wrap_plots(c(plot_list, list(p_inertia)), guides = "collect") +
plot_annotation(
title = "Principal Component Analysis",
subtitle = "Outliers Detection",
caption = glue(
"<sup>&dagger;</sup>Outliers defined for a Euclidean distance from cohort centroid (based on the principal components up to 3)<br>",
"higher than {iqr_threshold} times the interquartile range above the 75<sup>th</sup> percentile."
),
tag_levels = "A"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment