Skip to content

Instantly share code, notes, and snippets.

@mcanouil
Last active February 2, 2024 23:31
Show Gist options
  • Save mcanouil/29a33d21f2c21be9c59a1a59e8ca6d9d to your computer and use it in GitHub Desktop.
Save mcanouil/29a33d21f2c21be9c59a1a59e8ca6d9d to your computer and use it in GitHub Desktop.
Check Tissue From RNAseq
# # MIT License
#
# Copyright (c) 2024 Mickaël Canouil
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
check_tissue <- function(
genes_file,
tissue_ref,
threshold = 1,
n_comp = 3,
n_iqr = 3,
draw_label = FALSE,
norm = DESeq2::rlog,
title = "RNAseq Principal Component Analysis",
url = "https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
) {
my_theme <- ggplot2::theme_light() +
ggplot2::theme(
strip.background = ggplot2::element_rect(fill = "white", colour = "grey70"),
strip.text = ggtext::element_markdown(
colour = "black",
size = ggplot2::rel(0.4),
margin = ggplot2::margin(1, 1, 1, 1, unit = "pt")
),
plot.title.position = "plot",
plot.caption.position = "plot",
plot.title = ggtext::element_markdown(),
plot.subtitle = ggtext::element_markdown(face = "italic", size = ggplot2::rel(0.8)),
plot.caption = ggtext::element_markdown(face = "italic", size = ggplot2::rel(0.5)),
axis.text.x = ggtext::element_markdown(size = ggplot2::rel(0.5)),
axis.text.y = ggtext::element_markdown(size = ggplot2::rel(0.5)),
axis.title.x = ggtext::element_markdown(size = ggplot2::rel(0.75)),
axis.title.y = ggtext::element_markdown(size = ggplot2::rel(0.75)),
legend.title = ggtext::element_markdown(),
legend.text = ggtext::element_markdown(),
legend.position = "top",
legend.direction = "horizontal",
panel.spacing = ggplot2::unit(0, "npc"),
panel.grid.minor = ggplot2::element_blank()
)
pretty_pve <- function(pve) {
pca_contrib <- sprintf(
fmt = "PC%02d (%s %%)",
seq_along(pve),
format(
x = round(pve * 100, digits = 3),
digits = 2, nsmall = 2, trim = TRUE, scientific = FALSE
)
)
names(pca_contrib) <- sprintf("PC%02d", seq_along(pve))
pca_contrib
}
detect_outliers <- function(vectors, n_iqr, n_comp) {
pca_dfxy <- data.table::as.data.table(vectors, keep.rownames = "RNA_ID")
data.table::setnames(
x = pca_dfxy,
old = grep("^V[0-9]", names(pca_dfxy), value = TRUE),
new = sprintf("PC%02d", as.numeric(gsub("V", "", grep("^V[0-9]", names(pca_dfxy), value = TRUE))))
)
pca_dfxy[,
dist_centre := rowSums(sapply(.SD, function(x) as.vector(scale(x))^2)),
.SDcols = sprintf("PC%02d", 1:n_comp)
][,
is_outlier := factor(
x = dist_centre > (
stats::quantile(dist_centre, 0.75, na.rm = TRUE) +
n_iqr * stats::IQR(dist_centre, na.rm = TRUE)
),
levels = c(FALSE, TRUE),
labels = c("No", "Yes")
)
]
pca_dfxy
}
plot_plane <- function(pca_contrib, pca_dfxy, n_comp) {
lapply(
X = as.list(as.data.frame(utils::combn(1:n_comp, 2))),
pca_contrib = pca_contrib, pca_dfxy = pca_dfxy,
function(xy, pca_contrib, pca_dfxy) {
ggplot2::ggplot(
data = pca_dfxy,
mapping = ggplot2::aes(
x = .data[[sprintf("PC%02d", xy[1])]],
y = .data[[sprintf("PC%02d", xy[2])]],
colour = is_outlier,
shape = is_outlier
)
) +
ggplot2::geom_hline(yintercept = 0, linetype = 2, na.rm = TRUE) +
ggplot2::geom_vline(xintercept = 0, linetype = 2, na.rm = TRUE) +
ggplot2::geom_point(na.rm = TRUE, size = 0.75) +
ggplot2::stat_ellipse(type = "norm", na.rm = TRUE, show.legend = FALSE) +
ggplot2::scale_colour_viridis_d(
begin = 0.25,
end = 0.75,
guide = ggplot2::guide_legend(override.aes = list(size = 4)),
drop = FALSE
) +
ggplot2::scale_shape_manual(values = c(1, 4), drop = FALSE) +
{
if (draw_label) {
ggrepel::geom_label_repel(
mapping = ggplot2::aes(label = RNA_ID),
data = ~ .x[is_outlier %in% "Yes"],
size = 2,
fill = "white",
colour = "black",
segment.colour = "black",
min.segment.length = 0,
show.legend = FALSE
)
}
} +
ggplot2::scale_x_continuous(
breaks = scales::extended_breaks(3),
expand = ggplot2::expansion(c(0.15, 0.15))
) +
ggplot2::scale_y_continuous(
breaks = scales::extended_breaks(3),
expand = ggplot2::expansion(c(0.15, 0.15))
) +
ggplot2::labs(
x = pca_contrib[sprintf("PC%02d", xy[1])],
y = pca_contrib[sprintf("PC%02d", xy[2])],
colour = glue::glue('Possible Outlier<i><sup>1</sup></i>'),
shape = glue::glue('Possible Outlier<i><sup>1</sup></i>')
) +
ggplot2::coord_cartesian(
xlim = range(pca_dfxy[[sprintf("PC%02d", xy[1])]]),
ylim = range(pca_dfxy[[sprintf("PC%02d", xy[2])]])
) +
my_theme
}
)
}
## Get GTEx data
data_file <- basename(url)
if (!file.exists(file.path(tempdir(), data_file))) {
utils::download.file(url = url, destfile = file.path(tempdir(), data_file))
}
gtex_data <- data.table::fread(file.path(tempdir(), data_file))
gtex_data_rlog <- norm(as.matrix(round(gtex_data[, -c("Name", "Description")], digits = 0)))
gtex_data[, colnames(gtex_data_rlog) := as.data.frame(gtex_data_rlog)]
gtex_data <- data.table::melt(
data = gtex_data,
id.vars = c("Name", "Description"),
variable.name = "Tissue",
value.name = "TPM"
)[, c("group", "Name") := list(gsub("(.*) - .*", "\\1", Tissue), gsub("\\..*$", "", Name))]
## Import counts
txi_counts <- tximport::tximport(
files = genes_file,
type = "rsem",
txIn = FALSE,
txOut = FALSE,
countsFromAbundance = "no"
)
txi_counts$length[txi_counts$length == 0] <- 1
norm_counts <- SummarizedExperiment::assay(norm(DESeq2::DESeqDataSetFromTximport(
txi = txi_counts,
colData = data.frame(intercept = rep(1, ncol(txi_counts$counts))),
design = ~ 1
)))
pca_res <- flashpcaR::flashpca(
X = t(norm_counts),
stand = "center",
ndim = n_comp,
do_loadings = TRUE
)
pca_contrib <- pretty_pve(pca_res[["pve"]])
## Combine
tissue_genes <- merge(
x = data.table::as.data.table(pca_res[["loadings"]])[, Name := rownames(norm_counts)],
y = gtex_data,
by = "Name"
)[
TPM > (mean(TPM) + 3 * sd(TPM)), N := .N, by = Name
][N %in% 1][, N := .N, by = Tissue][N > 3]
data.table::setnames(
x = tissue_genes,
old = c(
"Name",
grep("^V[0-9]", names(tissue_genes), value = TRUE)
),
new = c(
"Gene",
sprintf("PC%02d", as.numeric(gsub("V", "", grep("^V[0-9]", names(tissue_genes), value = TRUE))))
)
)
ref_tissue <- data.table::rbindlist(lapply(tissue_ref, function(itissue_ref) {
if (itissue_ref %in% tissue_genes[, unique(Tissue)]) {
tissue_genes[Tissue %in% itissue_ref]
} else {
tissue_genes[grepl(itissue_ref, Tissue, ignore.case = TRUE)]
}
}))
ref_range <- ref_tissue[, lapply(.SD, function(x) {
rx <- range(x)
drx <- diff(rx) * threshold * sign(rx)
rx + drx
}), .SDcols = sprintf("PC%02d", 1:n_comp)]
tissue_genes[,
(sprintf("within_%02d", 1:n_comp)) := lapply(.SD, function(x) {
min(x) < min(ref_range[[substitute(x)]]) | max(x) > max(ref_range[[substitute(x)]])
}),
.SDcols = sprintf("PC%02d", 1:n_comp),
by = Tissue
][,
keep := rowSums(.SD) > 0 | Tissue %in% unique(ref_tissue[["Tissue"]]),
.SDcols = sprintf("within_%02d", 1:n_comp)
]
tissue_genes <- tissue_genes[(keep)]
plot_list_tissue <- lapply(as.list(as.data.frame(combn(1:n_comp, 2))), function(xy) {
ggplot2::ggplot(
data = tissue_genes,
mapping = ggplot2::aes(
x = .data[[sprintf("PC%02d", xy[1])]],
y = .data[[sprintf("PC%02d", xy[2])]],
colour = Tissue
)
) +
ggplot2::geom_hline(yintercept = 0, linetype = 2, na.rm = TRUE) +
ggplot2::geom_vline(xintercept = 0, linetype = 2, na.rm = TRUE) +
ggplot2::geom_point(na.rm = TRUE, size = 0.75, shape = 1) +
ggplot2::stat_ellipse(type = "norm", na.rm = TRUE, show.legend = FALSE) +
ggplot2::scale_x_continuous(
breaks = scales::extended_breaks(3),
expand = ggplot2::expansion(c(0.15, 0.15))) +
ggplot2::scale_y_continuous(
breaks = scales::extended_breaks(3),
expand = ggplot2::expansion(c(0.15, 0.15))
) +
ggplot2::labs(
x = paste0("Loadings<br>", pca_contrib[sprintf("PC%02d", xy[1])]),
y = paste0("Loadings<br>", pca_contrib[sprintf("PC%02d", xy[2])])
) +
ggplot2::coord_cartesian(
xlim = range(tissue_genes[[sprintf("PC%02d", xy[1])]]),
ylim = range(tissue_genes[[sprintf("PC%02d", xy[2])]])
) +
ggplot2::facet_wrap(
facets = ggplot2::vars(Tissue),
labeller = ggplot2::labeller(.default = function(x) gsub(" - ", "<br>", toupper(x)))
) +
my_theme +
ggplot2::theme(legend.position = "none")
})
### Samples check
pca_dfxy <- detect_outliers(pca_res[["vectors"]], n_iqr, n_comp)
plot_list_samples <- plot_plane(
pca_contrib = pca_contrib,
pca_dfxy = pca_dfxy,
n_comp = n_comp
)
### Check PCA on tissue
tissue_genes_list <- tissue_genes[!Tissue %in% ref_tissue[["Tissue"]]][["Gene"]]
if (length(tissue_genes_list) > 0) {
pca_res <- flashpcaR::flashpca(
X = t(norm_counts[tissue_genes[!Tissue %in% ref_tissue[["Tissue"]]][["Gene"]], ]),
stand = "center",
ndim = n_comp
)
pca_contrib <- pretty_pve(pca_res[["pve"]])
pca_dfxy <- detect_outliers(pca_res[["vectors"]], n_iqr, 1)
plot_list_samples_tissue <- plot_plane(
pca_contrib = pca_contrib,
pca_dfxy = pca_dfxy,
n_comp = n_comp
)
plot_0 <- patchwork::wrap_plots(
patchwork::guide_area(),
patchwork::wrap_plots(plot_list_samples, tag_level = "new", ncol = 1),
patchwork::wrap_plots(plot_list_tissue, tag_level = "new", ncol = 1),
patchwork::wrap_plots(plot_list_samples_tissue, tag_level = "new", ncol = 1),
nrow = 2,
ncol = 3,
widths = c(0.3, 0.4, 0.3),
heights = c(0.05, 0.95),
tag_level = "new",
guides = "collect",
design = "AAA\nBCD"
) +
patchwork::plot_annotation(
title = title,
subtitle = glue::glue(
"<b>A</b> Outliers<sup>1</sup> detection on regularised log transformed counts;<br>",
"<b>B</b> Tissue detection with <b>{tolower(tissue_ref)}</b> genes as reference<sup>2</sup>;<br>",
"<b>C</b> Outliers<sup>1</sup> detection using <b>",
'{tolower(glue::glue_collapse(tissue_genes[!Tissue %in% ref_tissue[["Tissue"]], unique(group)], sep = "</b>, <b>", last = "</b> and <b>"))}',
"</b> genes."
),
caption = glue::glue(
"<sup>1</sup>Outliers defined for a Euclidean distance from cohort centroid higher than {n_iqr} times the interquartile range above the 75<sup>th</sup> percentile,",
"based on <b>A</b> the principal components up to {n_comp} and <b>C</b> the first principal component.",
"<sup>2</sup>Tissue gene expression from '{data_file}'.",,
.sep = "<br>"
),
tag_levels = c("A", "1"),
theme = my_theme
)
} else {
plot_0 <- patchwork::wrap_plots(
patchwork::guide_area(),
patchwork::wrap_plots(plot_list_samples, tag_level = "new", ncol = 1),
patchwork::wrap_plots(plot_list_tissue, tag_level = "new", ncol = 1),
nrow = 2,
ncol = 2,
widths = c(0.40, 0.60),
heights = c(0.05, 0.95),
tag_level = "new",
guides = "collect",
design = "AA\nBC"
) +
patchwork::plot_annotation(
title = title,
subtitle = glue::glue(
"<b>A</b> Outliers<sup>1</sup> detection on regularised log transformed counts;<br>",
"<b>B</b> Tissue detection with <b>{tolower(tissue_ref)}</b> genes as reference<sup>2</sup>.",
),
caption = glue::glue(
"<sup>1</sup>Outliers defined for a Euclidean distance from cohort centroid higher than {n_iqr} times the interquartile range above the 75<sup>th</sup> percentile,",
"based on <b>A</b> the principal components up to {n_comp}.",
"<sup>2</sup>Tissue gene expression from '{data_file}'.",
.sep = "<br>"
),
tag_levels = c("A", "1"),
theme = my_theme
)
}
keep_samples <- pca_dfxy[is_outlier == "No", RNA_ID]
exclude_samples <- pca_dfxy[is_outlier == "Yes", RNA_ID]
if (length(tissue_genes_list) > 0) {
list(
plot = plot_0,
outliers = data.table::data.table(
RNA_ID = c(keep_samples, exclude_samples),
Tissue = c(rep(FALSE, length(keep_samples)), rep(TRUE, length(exclude_samples)))
)
)
} else {
list(
plot = plot_0,
outliers = data.table::data.table(
RNA_ID = c(keep_samples, exclude_samples),
Tissue = c(rep(FALSE, length(keep_samples)), rep(FALSE, length(exclude_samples)))
)
)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment