Skip to content

Instantly share code, notes, and snippets.

@crazyhottommy
Last active October 26, 2023 08:46
Show Gist options
  • Save crazyhottommy/042aa268bbc8c6a4067545600da10d24 to your computer and use it in GitHub Desktop.
Save crazyhottommy/042aa268bbc8c6a4067545600da10d24 to your computer and use it in GitHub Desktop.
TCGA
```{r}
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276",
"FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2")
```
```{r}
TCGAExprsAndPheno2 <- function(gene_vector, cancer_types = "all", expression_type = "TPM"){
library(recount3)
## Get all possible projects
all_proj <- available_projects()
tcga_proj <- all_proj %>% filter(file_source == "tcga")
if (cancer_types != "all") {
tcga_proj <- tcga_proj[tcga_proj$project %in% cancer_types,]
}
final_tcga <- NULL
for (tcga_ind in seq(nrow(tcga_proj))) {
proj_info <- tcga_proj[tcga_ind, c("project", "organism", "project_home")]
rse_gene_temp<- create_rse(proj_info)
count_matrix <- rse_gene_temp@assays@data$raw_counts
## Select gene ind
if (!identical("all", gene_vector)) {
gene_ind <- rse_gene_temp@rowRanges$gene_name %in% gene_vector
} else {
gene_ind <- seq(rse_gene_temp@rowRanges$gene_name)
}
if (expression_type == "TPM") {
#### Creating TPM
kilobase_length <- rse_gene_temp@rowRanges$bp_length
reads_per_rpk <- count_matrix/kilobase_length
per_mil_scale <- colSums(reads_per_rpk)/1000000
tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale)
## Make sure they match the ENSG and gene order
exprs_vector <- tpm_matrix[gene_ind,]
} else{
#log2 normalize the count matrix
total_reads<- colSums(count_matrix)
normalized_counts<- log2(t(t(count_matrix)*10000000/total_reads) + 1)
exprs_vector <- normalized_counts[gene_ind,]
}
tcga_data_frame <- rse_gene_temp@colData@listData |>
as.data.frame()
if (nrow(exprs_vector) > length(gene_vector)) {
ind_vector <- rownames(exprs_vector)
} else {
ind_vector <- rse_gene_temp@rowRanges[gene_ind, ]$gene_name
}
tcga_data_frame[,ind_vector] <- exprs_vector
tcga_data_frame <- select(tcga_data_frame, !!ind_vector, everything())
final_tcga <- rbind(final_tcga, tcga_data_frame)
}
## Group cancer data
final_tcga_filt <- final_tcga %>%
mutate(sample_type = case_when(
tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer",
tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer",
tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal"
))
if (startsWith(colnames(final_tcga_filt)[1], "ENSG")) {
message("Multiple columns share gene names, returning ENSG")
}
return(final_tcga_filt)
}
```
```{r}
tcga_TAA <- TCGAExprsAndPheno2(gene_vector = c(TAAs, "CD24"),cancer_type="all")
tcga_df<- tcga_TAA %>%
select(any_of(TAAs), CD24, tcga.tcga_barcode, sample_type, study) %>%
group_by(sample_type, study) %>%
summarise(across(1:20, ~log2(.x+1))) %>%
summarise(across(1:20, median)) %>%
arrange(study) %>%
filter(!is.na(sample_type))
tcga_df<- tcga_df %>%
filter(sample_type == "cancer")
tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix()
rownames(tcga_mat)<- tcga_df %>% pull(study)
cell_fun = function(j, i, x, y, w, h, fill) {
grid.rect(x = x, y = y, width = w, height = h,
gp = gpar(col = "black", fill = NA))
}
Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun,
name = "log2TPM")
```
@BiotechPedro
Copy link

That's great, Tommy!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment