|
# The anova analysis for RAP1A |
|
# Made hastily once we deemed Deseq2 not the best for the job |
|
# Therefore, it is sloppy and bad. Sorry to whomever has to read this. |
|
|
|
library(tidyverse) |
|
library(cowplot) # Plot grids |
|
library(ggsignif) # Significance bars on the plots |
|
library(PMCMRplus) # Dunn test |
|
|
|
# The expression of the two genes |
|
data <- read_tsv("./data/TTG_filtered_data") |
|
|
|
# Metadata for TCGA and GTEx |
|
clinical_data <- read_csv("./data/clean_clinical_metadata.csv") |
|
gtex_metadata <- read_tsv("./data/GTEX_phenotype") |
|
|
|
#### << Functions >> ### |
|
|
|
#' Make simpler clinical metadata and filtering for the different cohorts |
|
preprocess_metadata <- function(clinical_data) { |
|
# Get only the cols of interest |
|
metadata <- clinical_data[, c("tumor_id", "submitter_id")] |
|
|
|
# Generate a new column "status_metastatic" |
|
status <- rep("non_metastatic", length(clinical_data$tumor_id)) |
|
|
|
figo_stage <- as.character(clinical_data$figo_stage) |
|
figo_tf <- startsWith(figo_stage, "Stage III") | startsWith(figo_stage, "Stage IV") |
|
figo_tf[is.na(figo_tf)] <- FALSE |
|
# The levels to select were inspected manually. |
|
status_key <- |
|
! clinical_data$ajcc_clinical_m %in% c(NA, "M0", "MX") | |
|
! clinical_data$ajcc_pathologic_m %in% c(NA, "cM0 (i+)", "M0", "MX") | |
|
clinical_data$ajcc_clinical_stage %in% c("Stage IV", "Stage IVA", "Stage IVB", "Stage IVC") | |
|
figo_tf |
|
|
|
status[status_key] <- "metastatic" |
|
|
|
metadata$status_metastatic <- factor(status, c("metastatic", "non_metastatic")) |
|
|
|
# Generate a new column "status_limph" |
|
status <- rep("primary_tumor", length(clinical_data$tumor_id)) |
|
|
|
figo_stage <- as.character(clinical_data$figo_stage) |
|
figo_tf <- startsWith(figo_stage, "Stage III") | startsWith(figo_stage, "Stage IV") |
|
figo_tf[is.na(figo_tf)] <- FALSE |
|
# The levels to select were inspected manually. |
|
status_key <- |
|
! clinical_data$ajcc_clinical_m %in% c(NA, "M0", "MX") | |
|
! clinical_data$ajcc_pathologic_m %in% c(NA, "cM0 (i+)", "M0", "MX") | |
|
! clinical_data$ajcc_clinical_n %in% c(NA, "N0", "NX") | |
|
! clinical_data$ajcc_pathologic_n %in% c(NA, "N0", "N0 (i-)", "N0 (i+)", "N0 (mol+)", "NX") | |
|
clinical_data$ajcc_clinical_stage %in% c("Stage IV", "Stage IVA", "Stage IVB", "Stage IVC") | |
|
figo_tf |
|
|
|
status[status_key] <- "limph_nodes+metastatic" |
|
metadata$status_limph <- factor(status, c("limph_nodes+metastatic", "primary_tumor")) |
|
|
|
return(metadata) |
|
} |
|
|
|
#' Trunc the TCGA ID to just the patient id |
|
trunc_to_pid <- function(x) { |
|
return(str_trunc(x, 12, side = "right", ellipsis = "")) |
|
} |
|
|
|
#' Get the metadata from the whole metadata set from the set of IDs. |
|
get_metadata_from_id <- function(x) { |
|
if (startsWith(x, "GTEX")) { |
|
if (! x %in% gtex_metadata$Sample) {print(paste("Cannot find", x)); return(NULL)} |
|
return(list(x, gtex_metadata$`_primary_site`[gtex_metadata$Sample == x], "healthy", "healthy")) |
|
} |
|
if (startsWith(x, "TCGA")) { |
|
if (! trunc_to_pid(x) %in% simple_clinical_data$submitter_id) {print(paste("Cannot find", x)); return(NULL)} |
|
return(list( |
|
x, |
|
simple_clinical_data$tumor_id[simple_clinical_data$submitter_id == trunc_to_pid(x)], |
|
as.character(simple_clinical_data$status_metastatic[simple_clinical_data$submitter_id == trunc_to_pid(x)]), |
|
as.character(simple_clinical_data$status_limph[simple_clinical_data$submitter_id == trunc_to_pid(x)]) |
|
)) |
|
} |
|
} |
|
|
|
#' Bind the result of `get_metadata_from_id` together. |
|
bind <- function(x, y) { |
|
if (typeof(x) == "list") { |
|
x <- unlist(x) |
|
} |
|
y <- unlist(y) |
|
|
|
if (startsWith(y[3], "TCGA")) {print(y)} |
|
return(rbind(x, y)) |
|
} |
|
|
|
calc_median_diff <- function(data, x, y) { |
|
# We do x - y |
|
return(median(data$exprs[data$status == x]) - median(data$exprs[data$status == y])) |
|
} |
|
|
|
get_diff <- function(data, formula) { |
|
splif <- unlist(str_split(formula, "-")) |
|
return(calc_median_diff(data, splif[1], splif[2])) |
|
} |
|
|
|
#' Analyze some data with an exprs col and a status col |
|
analyze <- function(data) { |
|
# Numerosity |
|
table(data$source, data$status) |> print() |
|
|
|
# Normality of the data |
|
print("NORMALITY") |
|
data[, c("exprs", "status")] |> group_by(status) |> |
|
group_map(\(x, y) {ks.test(x$exprs, "rnorm") |> print(); ks.test(x$exprs, "rnorm")$p.value}) |> |
|
unlist() -> ks.pvals |
|
|
|
if (any(ks.pvals < 0.05 & table(data$status) < 30)) { |
|
print("Running in non-parametric mode.") |
|
kru.mod <- kruskal.test(exprs ~ status, data = data) |
|
dunn_res <- kwAllPairsDunnTest(exprs ~ status, data = data, p.adjust.method = "bonferroni") |
|
|
|
m <- dunn_res$p.value |
|
dunn_rnms <- paste0(rownames(m)[row(m)[lower.tri(m, diag = TRUE)]], "-", colnames(m)[col(m)[lower.tri(m, diag = TRUE)]]) |
|
dunn_rest <- data.frame( |
|
`p.adj` = m[lower.tri(m, diag = TRUE)], |
|
row.names = dunn_rnms |
|
) |
|
|
|
# Calculate differences in the median |
|
dunn_rest$diff <- sapply(dunn_rnms, get_diff, data = data) |
|
|
|
print(as.data.frame(dunn_rest)) |
|
|
|
return(as.data.frame(dunn_rest)) |
|
|
|
} else { |
|
# Make anova model |
|
anov.mod <- aov(exprs ~ status, data = data) |
|
|
|
# Variance of the data |
|
print("VARIANCE") |
|
plot(anov.mod, 1) |> print() |
|
|
|
# summary of anova |
|
summary(anov.mod) |> print() |
|
|
|
# Post test |
|
tukey_res <- TukeyHSD(anov.mod) |
|
tukey_res |> print() |
|
tukey_res |> plot() |
|
|
|
colnames(tukey_res$status) <- c("diff", "lwr", "upr", "p.adj") |
|
return(as.data.frame(tukey_res$status)) |
|
} |
|
|
|
} |
|
|
|
# Deanonimize for pipes |
|
to_frame <- \(x) {return(data.frame(sample_id = x[,1], source = x[,2], status_metastatic = x[,3], status_limph = x[,4]))} |
|
omit_null <- \(x) {return(x[! sapply(x, is.null)])} |
|
prreduce <- purrr::reduce |
|
|
|
prep_for_analysis <- function(x, stat_type) { |
|
data.frame(exprs = x$exprs, source = x$source, status = as.factor(x[, stat_type])) |
|
} |
|
|
|
#### <<< END Functions >>> ### |
|
|
|
simple_clinical_data <- preprocess_metadata(clinical_data) |
|
|
|
all_colnames <- colnames(data) |
|
# We clear out the `TARGET` Ids - we don't need them |
|
intr_colnames <- all_colnames[! startsWith(all_colnames, "TARGET")] |
|
|
|
# We make the metadata for our samples |
|
intr_colnames |> map(get_metadata_from_id) |> |
|
omit_null() |> prreduce(bind) |> to_frame() |> |
|
mutate(status_metastatic = as.factor(status_metastatic), status_limph = as.factor(status_limph)) |> |
|
remove_rownames() -> metadata |
|
|
|
data |> remove_rownames() |> column_to_rownames("sample") -> data |
|
|
|
# Ready for the analysis |
|
|
|
RAP1 <- data["RAP1A", , drop=FALSE] |
|
TRPM8 <- data["TRPM8", , drop=FALSE] |
|
|
|
# Reshape |
|
RAP1r <- data.frame(exprs = unlist(RAP1[1,])) |
|
rownames(RAP1r) <- colnames(RAP1) |
|
|
|
TRPMr <- data.frame(exprs = unlist(TRPM8[1,])) |
|
rownames(TRPMr) <- colnames(TRPM8) |
|
|
|
# Merge with metadata |
|
merge(TRPMr, metadata, by.x = 0, by.y = "sample_id") |> |
|
remove_rownames() |> column_to_rownames("Row.names") -> TRPMrm |
|
|
|
merge(RAP1r, metadata, by.x = 0, by.y = "sample_id") |> |
|
remove_rownames() |> column_to_rownames("Row.names") -> RAP1rm |
|
|
|
results <- list() |
|
|
|
# Running on the "meta/non meta" labelling |
|
# Run this one at a time to check for assumptions (plots + messages) |
|
RAP1rm |> filter(source == "Breast" | source == "TCGA-BRCA") |> |
|
prep_for_analysis("status_metastatic") |> analyze() -> results$RBRCA_meta |
|
RAP1rm |> filter(source == "Cervix Uteri" | source == "Uterus" | source == "TCGA-UCEC") |> |
|
prep_for_analysis("status_metastatic") |> analyze() -> results$RUCEC_meta |
|
RAP1rm |> filter(source == "Prostate" | source == "TCGA-PRAD") |> |
|
prep_for_analysis("status_metastatic") |> analyze() -> results$RPRAD_meta |
|
|
|
TRPMrm |> filter(source == "Breast" | source == "TCGA-BRCA") |> |
|
prep_for_analysis("status_metastatic") |> analyze() -> results$TBRCA_meta |
|
TRPMrm |> filter(source == "Cervix Uteri" | source == "Uterus" | source == "TCGA-UCEC") |> |
|
prep_for_analysis("status_metastatic") |> analyze() -> results$TUCEC_meta |
|
TRPMrm |> filter(source == "Prostate" | source == "TCGA-PRAD") |> |
|
prep_for_analysis("status_metastatic") |> analyze() -> results$TPRAD_meta |
|
|
|
# Running on the "limph/non limph" labelling |
|
RAP1rm |> filter(source == "Breast" | source == "TCGA-BRCA") |> |
|
prep_for_analysis("status_limph") |> analyze() -> results$RBRCA_limph |
|
RAP1rm |> filter(source == "Cervix Uteri" | source == "Uterus" | source == "TCGA-UCEC") |> |
|
prep_for_analysis("status_limph") |> analyze() -> results$RUCEC_limph |
|
RAP1rm |> filter(source == "Prostate" | source == "TCGA-PRAD") |> |
|
prep_for_analysis("status_limph") |> analyze() -> results$RPRAD_limph |
|
|
|
TRPMrm |> filter(source == "Breast" | source == "TCGA-BRCA") |> |
|
prep_for_analysis("status_limph") |> analyze() -> results$TBRCA_limph |
|
TRPMrm |> filter(source == "Cervix Uteri" | source == "Uterus" | source == "TCGA-UCEC") |> |
|
prep_for_analysis("status_limph") |> analyze() -> results$TUCEC_limph |
|
TRPMrm |> filter(source == "Prostate" | source == "TCGA-PRAD") |> |
|
prep_for_analysis("status_limph") |> analyze() -> results$TPRAD_limph |
|
|
|
|
|
### <<< Plotting and tabulating results >>> ### |
|
# Add significance |
|
sig <- function(pval) { |
|
if (pval < 0.001) {"***"} else if (pval < 0.01) {"**"} else if (pval < 0.05) {"*"} else {"-"} |
|
} |
|
|
|
add_sig <- function(x) { |
|
x$sig <- sapply(x$`p.adj`, sig) |
|
return(x) |
|
} |
|
|
|
results2 <- lapply(results, add_sig) |
|
|
|
# See a summary |
|
see_res <- function() { |
|
oo <- options(digits = 3) |
|
on.exit(options(oo)) |
|
|
|
sr <- function(x) { |
|
# Select and round down |
|
x |> mutate(across(everything(), format(digits = 3))) |
|
} |
|
|
|
print("RAP1A - PRAD - META") |
|
results2$RPRAD_meta[, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
print("TRPM8 - PRAD - META") |
|
results2$TPRAD_meta[, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
|
|
print("RAP1A - BRCA - META") |
|
results2$RBRCA_meta[, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
print("TRPM8 - BRCA - META") |
|
results2$TBRCA_meta[, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
|
|
print("RAP1A - UCEC - META") |
|
results2$RUCEC_meta[, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
print("TRPM8 - UCEC - META") |
|
results2$TUCEC_meta[, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
cat("---------------\n") |
|
|
|
ord <- c(2, 1, 3) |
|
|
|
print("RAP1A - PRAD - LIMPH") |
|
results2$RPRAD_limph[ord, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
print("TRPM8 - PRAD - LIMPH") |
|
results2$TPRAD_limph[ord, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
|
|
print("RAP1A - BRCA - LIMPH") |
|
results2$RBRCA_limph[ord, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
print("TRPM8 - BRCA - LIMPH") |
|
results2$TBRCA_limph[ord, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
|
|
print("RAP1A - UCEC - LIMPH") |
|
results2$RUCEC_limph[ord, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
print("TRPM8 - UCEC - LIMPH") |
|
results2$TUCEC_limph[ord, c("diff", "p.adj", "sig")] |> print() |
|
cat("\n") |
|
|
|
} |
|
|
|
see_res() |
|
|
|
# For some reason this is needed or the plot saving later fails. |
|
graphics.off() |
|
|
|
|
|
### PLOTS |
|
bplot <- function( |
|
data, title, |
|
extra.colours = c("#048ab3", "lightblue"), comps = c(TRUE, TRUE, TRUE), addn = TRUE |
|
) { |
|
give.n <- function(x){ |
|
return(c(y = min(data$exprs), label = length(x))) |
|
} |
|
|
|
# Reorder data groups |
|
if ("non_metastatic" %in% levels(data$status)) { |
|
data$status <- factor(data$status, levels = c("healthy", "non_metastatic", "metastatic")) |
|
all_comps <- list(c("healthy", "metastatic"), c("healthy", "non_metastatic"), c("non_metastatic", "metastatic")) |
|
} else { |
|
data$status <- factor(data$status, levels = c("healthy", "primary_tumor", "limph_nodes+metastatic")) |
|
all_comps <- list(c("healthy", "primary_tumor"), c("healthy", "limph_nodes+metastatic"), c("primary_tumor", "limph_nodes+metastatic")) |
|
} |
|
|
|
p <- ggplot(data = data, aes(x = status, y = exprs, fill = status)) + |
|
scale_fill_manual(values = c("lightgray", extra.colours)) + |
|
geom_violin() + |
|
geom_boxplot(width = 0.1, outlier.color = "#00ff1a", outlier.alpha = 0.5, outlier.size = 0.8, outlier.shape = 4) + |
|
ggtitle(title) + |
|
ylab("Normalized Expression (log2)") + |
|
xlab("Status") + |
|
theme_bw() + |
|
theme(legend.position = "none", plot.title = element_text(size = 10), axis.title = element_text(size = 8)) + |
|
geom_signif( |
|
comparisons = all_comps[comps], |
|
map_signif_level = TRUE, |
|
step_increase = 0.1, |
|
vjust = 0.5 |
|
) |
|
|
|
if (addn) { |
|
p <- p + |
|
stat_summary( |
|
fun.data = give.n, geom = "text", |
|
position = position_nudge(y = - 0.5), size = 2.5, |
|
colour = "black", |
|
) |
|
} |
|
|
|
return(p) |
|
} |
|
|
|
|
|
regenerate_plots <- function(stat_type, code) { |
|
print(stat_type) |
|
print(code) |
|
plots <- list() |
|
RAP1rm |> filter(source == "Prostate" | source == "TCGA-PRAD") |> |
|
prep_for_analysis(stat_type) |> |
|
bplot("RAP1A - Prostate Dataset", c("#c40000", "#e83e00"), c(FALSE, FALSE, FALSE)) -> plots$RPRO |
|
TRPMrm |> filter(source == "Prostate" | source == "TCGA-PRAD") |> |
|
prep_for_analysis(stat_type) |> |
|
bplot("TRPM8 - Prostate Dataset", c("#c40000", "#e83e00"), c(FALSE, TRUE, FALSE)) -> plots$TPRO |
|
|
|
RAP1rm |> filter(source == "Breast" | source == "TCGA-BRCA") |> |
|
prep_for_analysis(stat_type) |> |
|
bplot("RAP1A - Breast Dataset", comps = c(FALSE, FALSE, FALSE)) -> plots$RBRE |
|
TRPMrm |> filter(source == "Breast" | source == "TCGA-BRCA") |> |
|
prep_for_analysis(stat_type) |> |
|
bplot("TRPM8 - Breast Dataset", comps = c(FALSE, TRUE, FALSE)) -> plots$TBRE |
|
|
|
RAP1rm |> filter(source == "Cervix Uteri" | source == "Uterus" | source == "TCGA-UCEC") |> |
|
prep_for_analysis(stat_type) |> |
|
bplot("RAP1A - Uterus Dataset", c("#8f3199", "#bf41cc"), c(TRUE, TRUE, FALSE)) -> plots$RUTE |
|
TRPMrm |> filter(source == "Cervix Uteri" | source == "Uterus" | source == "TCGA-UCEC") |> |
|
prep_for_analysis(stat_type) |> |
|
bplot("TRPM8 - Uterus Dataset", c("#8f3199", "#bf41cc"), c(TRUE, TRUE, FALSE)) -> plots$TUTE |
|
|
|
pdf(file = paste0('./results/', code, '_results.pdf'), height = 11, width = 9) |
|
tiff(file = paste0('./results/', code, '_results.tiff'), height = 6600, width = 5400, res = 600) |
|
png(file = paste0('./results/', code, '_results.tiff'), height = 6600, width = 5400, res = 600) |
|
for (i in 1:3) { |
|
print(i) |
|
print(dev.cur()) |
|
plot_grid( |
|
plots$RPRO, plots$TPRO, |
|
plots$RBRE, plots$TBRE, |
|
plots$RUTE, plots$TUTE, |
|
ncol = 2 |
|
) |> print() |
|
# Cycle through the devices |
|
dev.set(dev.prev()) |
|
} |
|
graphics.off() |
|
} |
|
|
|
regenerate_all_plots <- function() { |
|
regenerate_plots("status_limph", "LIMPH") |
|
regenerate_plots("status_metastatic", "NOLIMPH") |
|
} |
|
|
|
regenerate_all_plots() |
|
|
|
## EXTRAS ## |
|
# See the TRPM8 zero counts. Not exactly zero, but less than 1 |
|
|
|
count_n <- function(x, y, fun) { |
|
matching <- sum(fun(x$exprs)) |
|
percent <- (matching / length(x$exprs)) * 100 |
|
print(paste0(y$status[1], ": ", matching, " - ", round(percent, 2), "%")) |
|
} |
|
|
|
TRPMrm |> filter(source == "Cervix Uteri" | source == "Uterus" | source == "TCGA-UCEC") |> |
|
group_by(status) |> group_walk(count_n, fun = \(x) {x < 1}) |
|
|
|
TRPMrm |> filter(source == "Breast" | source == "TCGA-BRCA") |> |
|
group_by(status) |> group_walk(count_n, fun = \(x) {x < 1}) |