crazyhottommy/TCGA_recount3.rmd

## TCGA_recount3.rmd
```{r}

TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276",
         "FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2")
```

```{r}
TCGAExprsAndPheno2 <- function(gene_vector, cancer_types = "all", expression_type = "TPM"){
  library(recount3)
  ## Get all possible projects
  all_proj <- available_projects()
  tcga_proj <- all_proj %>% filter(file_source == "tcga")
  if (cancer_types != "all") {
    tcga_proj <- tcga_proj[tcga_proj$project %in% cancer_types,]
  }
  final_tcga <- NULL
  for (tcga_ind in seq(nrow(tcga_proj))) {
    proj_info <- tcga_proj[tcga_ind, c("project", "organism", "project_home")]
    rse_gene_temp<- create_rse(proj_info)
    count_matrix <- rse_gene_temp@assays@data$raw_counts
    ## Select gene ind
    if (!identical("all", gene_vector)) {
      gene_ind <- rse_gene_temp@rowRanges$gene_name %in% gene_vector
    } else {
      gene_ind <- seq(rse_gene_temp@rowRanges$gene_name)
    }

    if (expression_type == "TPM") {
      #### Creating TPM
      kilobase_length <- rse_gene_temp@rowRanges$bp_length
      reads_per_rpk <- count_matrix/kilobase_length
      per_mil_scale <- colSums(reads_per_rpk)/1000000
      tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale)
      ## Make sure they match the ENSG and gene order
      exprs_vector <- tpm_matrix[gene_ind,]
    } else{
      #log2 normalize the count matrix
      total_reads<- colSums(count_matrix)
      normalized_counts<- log2(t(t(count_matrix)*10000000/total_reads) + 1)
      exprs_vector <- normalized_counts[gene_ind,]
    }
    tcga_data_frame <- rse_gene_temp@colData@listData |>
      as.data.frame()
    if (nrow(exprs_vector) > length(gene_vector)) {
      ind_vector <- rownames(exprs_vector)
    } else {
      ind_vector <- rse_gene_temp@rowRanges[gene_ind, ]$gene_name
      }
    tcga_data_frame[,ind_vector] <- exprs_vector
    tcga_data_frame <- select(tcga_data_frame, !!ind_vector, everything())
    final_tcga <- rbind(final_tcga, tcga_data_frame)
  }

  ## Group cancer data
  final_tcga_filt <- final_tcga %>%
    mutate(sample_type = case_when(
      tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer",
      tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic",
      tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic",
      tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer",
      tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer",
      tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer",
      tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal"

    ))
  if (startsWith(colnames(final_tcga_filt)[1], "ENSG")) {
    message("Multiple columns share gene names, returning ENSG")
  }
  return(final_tcga_filt)
}

```

```{r}

tcga_TAA <- TCGAExprsAndPheno2(gene_vector = c(TAAs, "CD24"),cancer_type="all")

tcga_df<- tcga_TAA %>%
  select(any_of(TAAs), CD24, tcga.tcga_barcode, sample_type, study) %>%
  group_by(sample_type, study) %>%
  summarise(across(1:20, ~log2(.x+1))) %>%
  summarise(across(1:20, median)) %>%
  arrange(study) %>%
  filter(!is.na(sample_type))

tcga_df<- tcga_df %>%
  filter(sample_type == "cancer")

tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix()

rownames(tcga_mat)<- tcga_df %>% pull(study)

cell_fun = function(j, i, x, y, w, h, fill) {
    grid.rect(x = x, y = y, width = w, height = h,
              gp = gpar(col = "black", fill = NA))
}


Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun,
        name = "log2TPM")

```
	```{r}

	TAAs<- c("MSLN", "EGFR", "ERBB2", "CEACAM5", "NECTIN4", "EPCAM", "MUC16", "MUC1", "CD276",
	"FOLH1", "DLL3", "VTCN1", "PROM1", "PVR", "CLDN6", "MET", "FOLR1", "TNFRSF10B", "TACSTD2")
	```

	```{r}
	TCGAExprsAndPheno2 <- function(gene_vector, cancer_types = "all", expression_type = "TPM"){
	library(recount3)
	## Get all possible projects
	all_proj <- available_projects()
	tcga_proj <- all_proj %>% filter(file_source == "tcga")
	if (cancer_types != "all") {
	tcga_proj <- tcga_proj[tcga_proj$project %in% cancer_types,]
	}
	final_tcga <- NULL
	for (tcga_ind in seq(nrow(tcga_proj))) {
	proj_info <- tcga_proj[tcga_ind, c("project", "organism", "project_home")]
	rse_gene_temp<- create_rse(proj_info)
	count_matrix <- rse_gene_temp@assays@data$raw_counts
	## Select gene ind
	if (!identical("all", gene_vector)) {
	gene_ind <- rse_gene_temp@rowRanges$gene_name %in% gene_vector
	} else {
	gene_ind <- seq(rse_gene_temp@rowRanges$gene_name)
	}

	if (expression_type == "TPM") {
	#### Creating TPM
	kilobase_length <- rse_gene_temp@rowRanges$bp_length
	reads_per_rpk <- count_matrix/kilobase_length
	per_mil_scale <- colSums(reads_per_rpk)/1000000
	tpm_matrix <- t(t(reads_per_rpk)/per_mil_scale)
	## Make sure they match the ENSG and gene order
	exprs_vector <- tpm_matrix[gene_ind,]
	} else{
	#log2 normalize the count matrix
	total_reads<- colSums(count_matrix)
	normalized_counts<- log2(t(t(count_matrix)*10000000/total_reads) + 1)
	exprs_vector <- normalized_counts[gene_ind,]
	}
	tcga_data_frame <- rse_gene_temp@colData@listData \|>
	as.data.frame()
	if (nrow(exprs_vector) > length(gene_vector)) {
	ind_vector <- rownames(exprs_vector)
	} else {
	ind_vector <- rse_gene_temp@rowRanges[gene_ind, ]$gene_name
	}
	tcga_data_frame[,ind_vector] <- exprs_vector
	tcga_data_frame <- select(tcga_data_frame, !!ind_vector, everything())
	final_tcga <- rbind(final_tcga, tcga_data_frame)
	}

	## Group cancer data
	final_tcga_filt <- final_tcga %>%
	mutate(sample_type = case_when(
	tcga.cgc_sample_sample_type == "Additional - New Primary" ~ "cancer",
	tcga.cgc_sample_sample_type == "Additional Metastatic" ~ "metastatic",
	tcga.cgc_sample_sample_type == "Metastatic" ~ "metastatic",
	tcga.cgc_sample_sample_type == "Primary Blood Derived Cancer - Peripheral Blood " ~ "cancer",
	tcga.cgc_sample_sample_type == "Primary Tumor" ~ "cancer",
	tcga.cgc_sample_sample_type == "Recurrent Tumor" ~ "cancer",
	tcga.cgc_sample_sample_type == "Solid Tissue Normal" ~ "normal"

	))
	if (startsWith(colnames(final_tcga_filt)[1], "ENSG")) {
	message("Multiple columns share gene names, returning ENSG")
	}
	return(final_tcga_filt)
	}

	```

	```{r}

	tcga_TAA <- TCGAExprsAndPheno2(gene_vector = c(TAAs, "CD24"),cancer_type="all")

	tcga_df<- tcga_TAA %>%
	select(any_of(TAAs), CD24, tcga.tcga_barcode, sample_type, study) %>%
	group_by(sample_type, study) %>%
	summarise(across(1:20, ~log2(.x+1))) %>%
	summarise(across(1:20, median)) %>%
	arrange(study) %>%
	filter(!is.na(sample_type))

	tcga_df<- tcga_df %>%
	filter(sample_type == "cancer")

	tcga_mat<- tcga_df[, -c(1,2)] %>% as.matrix()

	rownames(tcga_mat)<- tcga_df %>% pull(study)

	cell_fun = function(j, i, x, y, w, h, fill) {
	grid.rect(x = x, y = y, width = w, height = h,
	gp = gpar(col = "black", fill = NA))
	}


	Heatmap(tcga_mat, cluster_columns = TRUE, cell_fun = cell_fun,
	name = "log2TPM")

	```