arraytools/TCGAbiolinks-GBM.Rmd

## TCGAbiolinks-GBM.Rmd
Download Raw counts
```{r}
library(TCGAbiolinks)

proj <- "TCGA-GBM"
query <- GDCquery(
    project = proj,
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    workflow.type = "STAR - Counts"
)
GDCdownload(query)
# Downloading data for project TCGA-GBM
# Of the 175 files for download 2 already exist.
# We will download only those that are missing ones.
# GDCdownload will download 173 files. A total of 733.697085 MB
# Downloading as: Sun_Mar_10_16_33_45_2024.tar.gz
data <- GDCprepare(query)  #  60660   175
class(data)
# [1] "RangedSummarizedExperiment"
# attr(,"package")
# [1] "SummarizedExperiment"
```

Remove samples with NAs annotation
```{r}
data <- data[,!is.na(data$paper_IDH.status)]
dim(data)
# [1] 60660   153

data
# class: RangedSummarizedExperiment
# dim: 60660 153
# metadata(1): data_release
# assays(6): unstranded stranded_first ... fpkm_unstrand fpkm_uq_unstrand
# rownames(60660): ENSG00000000003.15 ENSG00000000005.6 ...
#   ENSG00000288674.1 ENSG00000288675.1
# rowData names(10): source type ... hgnc_id havana_gene
# colnames(153): TCGA-06-5410-01A-01R-1849-01 TCGA-19-5960-01A-11R-1850-01
#   ... TCGA-06-0187-01A-01R-1849-01 TCGA-27-2519-01A-01R-1850-01
# colData names(109): barcode patient ...
#   paper_Telomere.length.estimate.in.blood.normal..Kb.
#   paper_Telomere.length.estimate.in.tumor..Kb.

colData(data) |> dim()
# [1] 153 109
table(colData(data)$paper_IDH.status)
# Mutant     WT
#     11    142

assay(data)[1:5, 1:2]
#                    TCGA-06-5410-01A-01R-1849-01 TCGA-19-5960-01A-11R-1850-01
# ENSG00000000003.15                         2203                         6187
# ENSG00000000005.6                             3                           20
# ENSG00000000419.13                         1509                         1608
# ENSG00000000457.14                          478                          835
# ENSG00000000460.17                          357                          594
```

Perform DE analysis
```{r DESeq2}
library(DESeq2)

ddsSE <- DESeqDataSet(data, design = ~ paper_IDH.status)
keep <- rowSums(counts(ddsSE)) >= 10
ddsSE <- ddsSE[keep,]
ddsSE <- DESeq(ddsSE)
# estimating size factors
# estimating dispersions
# gene-wise dispersion estimates
# mean-dispersion relationship
# final dispersion estimates
# fitting model and testing
# -- replacing outliers and refitting for 2134 genes
# -- DESeq argument 'minReplicatesForReplace' = 7
# -- original counts are preserved in counts(dds)
# estimating dispersions
# fitting model and testing
resultsNames(ddsSE)
# 1] "Intercept"                     "paper_IDH.status_WT_vs_Mutant"

res <- results(ddsSE, name = "paper_IDH.status_WT_vs_Mutant")
dea <- as.data.frame(res)
summary(res)
# out of 47965 with nonzero total read count
# adjusted p-value < 0.1
# LFC > 0 (up)       : 3464, 7.2%
# LFC < 0 (down)     : 2821, 5.9%
# outliers [1]       : 0, 0%
# low counts [2]     : 13020, 27%
# (mean count < 1)
# [1] see 'cooksCutoff' argument of ?results
# [2] see 'independentFiltering' argument of ?results

dim(dea)
# [1] 47966     6
dea[1:3, ] |> round(3)
#                    baseMean log2FoldChange lfcSE  stat pvalue  padj
# ENSG00000000003.15 4224.253          0.327 0.214 1.531  0.126 0.342
# ENSG00000000005.6    17.369          1.293 0.530 2.437  0.015 0.088
# ENSG00000000419.13 1246.046          0.043 0.140 0.307  0.759 0.888

```

[DeSeq2 results converting ENSG IDs to Gene Symbols, more than one ENSG ID per Gene symbol](https://support.bioconductor.org/p/105430/)
```{r GeneSymbol}
if (FALSE) {
  # NOT WORKING
  # https://stackoverflow.com/a/28544212
  library('biomaRt')
  mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))

  genes <- c("ENSG00000260727", "ENSG00000277521", "ENSG00000116514")

  G_list <- getBM(filters = "ensembl_gene_id",
                  attributes = c("ensembl_gene_id", "external_gene_name"),
                  values = genes,
                  mart = mart)
}
library(org.Hs.eg.db)
library(AnnotationDbi)
# mapIds(org.Hs.eg.db,
#           keys=genes,
#           column="SYMBOL", keytype="ENSEMBL", multiVals="first")

dea$symbol <- mapIds(org.Hs.eg.db,
            keys=substr(rownames(dea), 1, 15),
            column="SYMBOL", keytype="ENSEMBL", multiVals="first")
```

```{r ReduceDuplicated}
length(unique(dea$symbol))
# 29322

# keep the one with the largest overall expression value.
o <- order(dea$baseMean, decreasing=TRUE)
dea2 <- dea[o,]
dea3 <- dea2[!duplicated(dea2$symbol),]
dim(dea3)
# 29322     7

sum(is.na(dea3$symbol))
# [1] 1
dea4 <- dea3[! is.na(dea3$symbol), ]
dim(dea4)
# 29321     7
```
	Download Raw counts
	```{r}
	library(TCGAbiolinks)

	proj <- "TCGA-GBM"
	query <- GDCquery(
	project = proj,
	data.category = "Transcriptome Profiling",
	data.type = "Gene Expression Quantification",
	workflow.type = "STAR - Counts"
	)
	GDCdownload(query)
	# Downloading data for project TCGA-GBM
	# Of the 175 files for download 2 already exist.
	# We will download only those that are missing ones.
	# GDCdownload will download 173 files. A total of 733.697085 MB
	# Downloading as: Sun_Mar_10_16_33_45_2024.tar.gz
	data <- GDCprepare(query) # 60660 175
	class(data)
	# [1] "RangedSummarizedExperiment"
	# attr(,"package")
	# [1] "SummarizedExperiment"
	```

	Remove samples with NAs annotation
	```{r}
	data <- data[,!is.na(data$paper_IDH.status)]
	dim(data)
	# [1] 60660 153

	data
	# class: RangedSummarizedExperiment
	# dim: 60660 153
	# metadata(1): data_release
	# assays(6): unstranded stranded_first ... fpkm_unstrand fpkm_uq_unstrand
	# rownames(60660): ENSG00000000003.15 ENSG00000000005.6 ...
	# ENSG00000288674.1 ENSG00000288675.1
	# rowData names(10): source type ... hgnc_id havana_gene
	# colnames(153): TCGA-06-5410-01A-01R-1849-01 TCGA-19-5960-01A-11R-1850-01
	# ... TCGA-06-0187-01A-01R-1849-01 TCGA-27-2519-01A-01R-1850-01
	# colData names(109): barcode patient ...
	# paper_Telomere.length.estimate.in.blood.normal..Kb.
	# paper_Telomere.length.estimate.in.tumor..Kb.

	colData(data) \|> dim()
	# [1] 153 109
	table(colData(data)$paper_IDH.status)
	# Mutant WT
	# 11 142

	assay(data)[1:5, 1:2]
	# TCGA-06-5410-01A-01R-1849-01 TCGA-19-5960-01A-11R-1850-01
	# ENSG00000000003.15 2203 6187
	# ENSG00000000005.6 3 20
	# ENSG00000000419.13 1509 1608
	# ENSG00000000457.14 478 835
	# ENSG00000000460.17 357 594
	```

	Perform DE analysis
	```{r DESeq2}
	library(DESeq2)

	ddsSE <- DESeqDataSet(data, design = ~ paper_IDH.status)
	keep <- rowSums(counts(ddsSE)) >= 10
	ddsSE <- ddsSE[keep,]
	ddsSE <- DESeq(ddsSE)
	# estimating size factors
	# estimating dispersions
	# gene-wise dispersion estimates
	# mean-dispersion relationship
	# final dispersion estimates
	# fitting model and testing
	# -- replacing outliers and refitting for 2134 genes
	# -- DESeq argument 'minReplicatesForReplace' = 7
	# -- original counts are preserved in counts(dds)
	# estimating dispersions
	# fitting model and testing
	resultsNames(ddsSE)
	# 1] "Intercept" "paper_IDH.status_WT_vs_Mutant"

	res <- results(ddsSE, name = "paper_IDH.status_WT_vs_Mutant")
	dea <- as.data.frame(res)
	summary(res)
	# out of 47965 with nonzero total read count
	# adjusted p-value < 0.1
	# LFC > 0 (up) : 3464, 7.2%
	# LFC < 0 (down) : 2821, 5.9%
	# outliers [1] : 0, 0%
	# low counts [2] : 13020, 27%
	# (mean count < 1)
	# [1] see 'cooksCutoff' argument of ?results
	# [2] see 'independentFiltering' argument of ?results

	dim(dea)
	# [1] 47966 6
	dea[1:3, ] \|> round(3)
	# baseMean log2FoldChange lfcSE stat pvalue padj
	# ENSG00000000003.15 4224.253 0.327 0.214 1.531 0.126 0.342
	# ENSG00000000005.6 17.369 1.293 0.530 2.437 0.015 0.088
	# ENSG00000000419.13 1246.046 0.043 0.140 0.307 0.759 0.888

	```

	[DeSeq2 results converting ENSG IDs to Gene Symbols, more than one ENSG ID per Gene symbol](https://support.bioconductor.org/p/105430/)
	```{r GeneSymbol}
	if (FALSE) {
	# NOT WORKING
	# https://stackoverflow.com/a/28544212
	library('biomaRt')
	mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))

	genes <- c("ENSG00000260727", "ENSG00000277521", "ENSG00000116514")

	G_list <- getBM(filters = "ensembl_gene_id",
	attributes = c("ensembl_gene_id", "external_gene_name"),
	values = genes,
	mart = mart)
	}
	library(org.Hs.eg.db)
	library(AnnotationDbi)
	# mapIds(org.Hs.eg.db,
	# keys=genes,
	# column="SYMBOL", keytype="ENSEMBL", multiVals="first")

	dea$symbol <- mapIds(org.Hs.eg.db,
	keys=substr(rownames(dea), 1, 15),
	column="SYMBOL", keytype="ENSEMBL", multiVals="first")
	```

	```{r ReduceDuplicated}
	length(unique(dea$symbol))
	# 29322

	# keep the one with the largest overall expression value.
	o <- order(dea$baseMean, decreasing=TRUE)
	dea2 <- dea[o,]
	dea3 <- dea2[!duplicated(dea2$symbol),]
	dim(dea3)
	# 29322 7

	sum(is.na(dea3$symbol))
	# [1] 1
	dea4 <- dea3[! is.na(dea3$symbol), ]
	dim(dea4)
	# 29321 7
	```