Skip to content

Instantly share code, notes, and snippets.

@crazyhottommy
Last active October 26, 2023 08:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save crazyhottommy/042aa268bbc8c6a4067545600da10d24 to your computer and use it in GitHub Desktop.
Save crazyhottommy/042aa268bbc8c6a4067545600da10d24 to your computer and use it in GitHub Desktop.
TCGA
```{r}
TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276",
"FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2")
```
```{r}
TCGAExprsAndPheno2 <- function(gene_vector, cancer_types = "all", expression_type = "TPM"){
library(recount3)
## Get all possible projects
all_proj <- available_projects()
tcga_proj <- all_proj %>% filter(file_source == "tcga")
if (cancer_types != "all") {
tcga_proj <- tcga_proj[tcga_proj$project %in% cancer_types,]
}
final_tcga <- NULL
for (tcga_ind in seq(nrow(tcga_proj))) {
proj_info <- tcga_proj[tcga_ind, c("project", "organism", "project_home")]
rse_gene_temp<- create_rse(proj_info)
count_matrix <- rse_gene_temp@assays@data$raw_counts
## Select gene ind
if (!identical("all", gene_vector)) {
gene_ind <- rse_gene_temp@rowRanges$gene_name %in% gene_vector
} else {
gene_ind <- seq(rse_gene_temp@rowRanges$gene_name)
}
if (expression_type == "TPM") {
#### Creating TPM
kilobase_length <- rse_gene_temp@rowRanges$bp_length
reads_per_rpk <- count_matrix/kilobase_length
per_mil_scale <- colSums(reads_per_rpk)/1000000
tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale)
## Make sure they match the ENSG and gene order
exprs_vector <- tpm_matrix[gene_ind,]
} else{
#log2 normalize the count matrix
total_reads<- colSums(count_matrix)
normalized_counts<- log2(t(t(count_matrix)*10000000/total_reads) + 1)
exprs_vector <- normalized_counts[gene_ind,]
}
tcga_data_frame <- rse_gene_temp@colData@listData |>
as.data.frame()
if (nrow(exprs_vector) > length(gene_vector)) {
ind_vector <- rownames(exprs_vector)
} else {
ind_vector <- rse_gene_temp@rowRanges[gene_ind, ]$gene_name
}
tcga_data_frame[,ind_vector] <- exprs_vector
tcga_data_frame <- select(tcga_data_frame, !!ind_vector, everything())
final_tcga <- rbind(final_tcga, tcga_data_frame)
}
## Group cancer data
final_tcga_filt <- final_tcga %>%
mutate(sample_type = case_when(
tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer",
tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic",
tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer",
tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer",
tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal"
))
if (startsWith(colnames(final_tcga_filt)[1], "ENSG")) {
message("Multiple columns share gene names, returning ENSG")
}
return(final_tcga_filt)
}
```
```{r}
tcga_TAA <- TCGAExprsAndPheno2(gene_vector = c(TAAs, "CD24"),cancer_type="all")
tcga_df<- tcga_TAA %>%
select(any_of(TAAs), CD24, tcga.tcga_barcode, sample_type, study) %>%
group_by(sample_type, study) %>%
summarise(across(1:20, ~log2(.x+1))) %>%
summarise(across(1:20, median)) %>%
arrange(study) %>%
filter(!is.na(sample_type))
tcga_df<- tcga_df %>%
filter(sample_type == "cancer")
tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix()
rownames(tcga_mat)<- tcga_df %>% pull(study)
cell_fun = function(j, i, x, y, w, h, fill) {
grid.rect(x = x, y = y, width = w, height = h,
gp = gpar(col = "black", fill = NA))
}
Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun,
name = "log2TPM")
```
@crazyhottommy
Copy link
Author

Screen Shot 2023-10-20 at 4 21 32 PM

@BiotechPedro
Copy link

Hi Tommy. Here is the code and the result of my analysis!

`library(cBioPortalData)
library(org.Hs.eg.db)
library(stringr)
library(ComplexHeatmap)

genes_of_interest <- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM",
"MUC16", "MUC1", "CD276", "FOLH1", "DLL3", "VTCN1",
"PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2")
cbio <- cBioPortal()
studies <- getStudies(cbio, buildReport = TRUE)
TCGA_studies <- studies[grep("PanCancer", studies$name), ]

df <- setNames(as.data.frame(lapply(TCGA_studies$studyId, function(study_ID) {
median_value <- unlist(lapply(genes_of_interest, function(gene_of_interest) {
entrez <- mapIds(org.Hs.eg.db, gene_of_interest, "ENTREZID", "SYMBOL")
mol_prof <- paste(study_ID, "rna_seq_v2_mrna", sep = "_" )
df <- molecularData(api = cbio, molecularProfileIds = mol_prof,
entrezGeneIds = entrez,
sampleIds = allSamples(cbio, study_ID)$sampleId)
result <- median(df[[mol_prof]]$value)
if (is.null(result)) {NA} else {result}
}))
setNames(median_value, genes_of_interest)
})), nm = TCGA_studies$studyId)

colnames(df) <- toupper(str_split(colnames(df), "_", simplify = TRUE)[, 1])
Heatmap(log(as.matrix(df) + 0.01), name = "log(RSEM)")`

TCGA

Best,

Peter

@crazyhottommy
Copy link
Author

Thanks, Peter! This result looks right to me, with MSLN high in MESO and FOLH1 high in PRAD.
I need to figure out what's wrong with my code working with recount3 data... likely the way I calculate TPM is wrong.

Thanks again!

Tommy

@crazyhottommy
Copy link
Author

crazyhottommy commented Oct 24, 2023

It seems the recount3 raw count is not right... I tried using a different method and the result (raw counts to TPM) is similar to the RSEM

a different data resource

library(ExperimentHub)
library(ComplexHeatmap)

eh = ExperimentHub()
query(eh , "GSE62944")


tcga_data<- eh[["EH1043"]]

colData(tcga_data)[1:5, 1:5]

count_mat<- assay(tcga_data)

count_mat[1:5, 1:5]

pheno_data<- phenoData(tcga_data)

get the gene length

#BiocManager::install("TxDb.Hsapiens.UCSC.hg19.knownGene")
#BiocManager::install("org.Hs.eg.db")

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(readr)

txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
hg19_genes<- genes(txdb)
hg19_genes

# map the Entrez ID to gene symbol
gene_symbol<- AnnotationDbi::select(org.Hs.eg.db, keys=hg19_genes$gene_id, 
                                    columns="SYMBOL", keytype="ENTREZID")

all.equal(hg19_genes$gene_id, gene_symbol$ENTREZID)

hg19_genes$symbol<- gene_symbol$SYMBOL

hg19_genes

gene_df<- data.frame(EntrezID = hg19_genes$gene_id, 
                     Symbol = hg19_genes$symbol, 
                     Gene_length = width(hg19_genes))
# almost 3000 genes not in the dataframe..
table(rownames(count_mat) %in% gene_df$Symbol)

indx<- intersect(rownames(count_mat),  gene_df$Symbol)

setdiff(rownames(count_mat), gene_df$Symbol)
setdiff(gene_df$Symbol, rownames(count_mat))


count_mat<- count_mat[indx, ]

gene_df<- gene_df %>%
  slice(match(indx, Symbol))

dim(gene_df)

convert counts to TPM

count_to_tpm<- function(mat, gene_length){
  mat<- mat/gene_length
  total_per_sample<- colSums(mat)
  mat<- t(t(mat)/total_per_sample)
  return(mat*1000000)
}

tpm_mat<- count_to_tpm(count_mat, gene_df$Gene_length)

TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "EPCAM", "MUC16", "MUC1", "CD276",
         "FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2")

# can not find "NECTIN4", it might be a different name 
TAAs[!TAAs %in% rownames(tpm_mat)]

tpm_sub<- tpm_mat[TAAs, ]

tpm_median<- cbind(t(tpm_sub), CancerType = as.character(colData(tcga_data)$CancerType)) %>%
  as.data.frame() %>%
  mutate(across(1:18, as.numeric)) %>%
  group_by(CancerType) %>%
  summarise(across(1:18, median)) 

tpm_sub_mat<- as.matrix(tpm_median[,-1])
rownames(tpm_sub_mat)<- tpm_median$CancerType

col_fun<- circlize::colorRamp2(c(-2,0,2), colors = c("blue", "white", "red"))

Heatmap(t(scale(log2(tpm_sub_mat +1))), cluster_columns = TRUE, cell_fun = cell_fun,
        name = "log2TPM", col = col_fun)

image

@crazyhottommy
Copy link
Author

I rewrote the code, and now the results make sense:

#BiocManager::install("recount3")
library(recount3)
library(purrr)
human_projects <- available_projects()

tcga_info = subset(
    human_projects,
    file_source == "tcga" & project_type == "data_sources"
)

seq(nrow(tcga_info))

proj_info <- map(seq(nrow(tcga_info)), ~tcga_info[.x, ])


rse_tcga <- map(proj_info, ~create_rse(.x))


/Users/tommytang/Library/Caches/org.R-project.R/R/recount3
does not exist, create directory? (yes/no): yes

TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276", "FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2", "CD24")

#### Creating TPM
count2tpm<- function(rse){
        count_matrix <- rse@assays@data$raw_counts
        gene_length <- rse@rowRanges$bp_length
        reads_per_rpk <- count_matrix/gene_length
        per_mil_scale <- colSums(reads_per_rpk)/1000000
        tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale)
         ## Make sure they match the ENSG and gene order
        gene_ind<-  rse@rowRanges$gene_name %in% TAAs 
        tpm_submatrix <- tpm_matrix[gene_ind,]
        rownames(tpm_submatrix)<- rse@rowRanges[gene_ind, ]$gene_name
        return(tpm_submatrix)
}


tpm_data<- map(rse_tcga, count2tpm)
metadata<- map(rse_tcga, ~.x@colData@listData %>% as.data.frame())

tpm_data2<- reduce(tpm_data, cbind)
metadata2<- reduce(metadata, rbind)
dim(tpm_data2)
dim(metadata2)
metadata2<- metadata2 %>%
        select(tcga.tcga_barcode, tcga.cgc_sample_sample_type, study) %>%
        mutate(sample_type = case_when(
                tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer",
                tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic",
                tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic",
                tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer",
                tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer",
                tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer",
                tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal"
    ))

final_df<- cbind(t(tpm_data2), metadata2)
library(ComplexHeatmap)
tcga_df<- final_df %>%
        filter(sample_type == "cancer") %>%
        group_by(sample_type, study) %>%
        summarise(across(1:20, ~log2(.x+1))) %>%
        summarise(across(1:20, median)) %>%
        arrange(study) %>%
        filter(!is.na(sample_type))

tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix()
rownames(tcga_mat)<- tcga_df %>% pull(study)

cell_fun = function(j, i, x, y, w, h, fill) {
    grid.rect(x = x, y = y, width = w, height = h, 
              gp = gpar(col = "black", fill = NA))
}

Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun,
        name = "log2TPM")

You can also scale the expression for each gene across the cancer types

Heatmap(scale(tcga_mat), cluster_columns = TRUE, cell_fun = cell_fun,
        name = "scaled\nlog2TPM")

Rplot

@BiotechPedro
Copy link

That's great, Tommy!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment