Last active
March 11, 2024 00:27
-
-
Save arraytools/6e6142f6fabb31e54e188ea1fb0deeee to your computer and use it in GitHub Desktop.
This is an updated/enhanced version of [TCGAbiolinks to DESEq2](https://rpubs.com/tiagochst/TCGAbiolinks_to_DESEq2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Download Raw counts | |
```{r} | |
library(TCGAbiolinks) | |
proj <- "TCGA-GBM" | |
query <- GDCquery( | |
project = proj, | |
data.category = "Transcriptome Profiling", | |
data.type = "Gene Expression Quantification", | |
workflow.type = "STAR - Counts" | |
) | |
GDCdownload(query) | |
# Downloading data for project TCGA-GBM | |
# Of the 175 files for download 2 already exist. | |
# We will download only those that are missing ones. | |
# GDCdownload will download 173 files. A total of 733.697085 MB | |
# Downloading as: Sun_Mar_10_16_33_45_2024.tar.gz | |
data <- GDCprepare(query) # 60660 175 | |
class(data) | |
# [1] "RangedSummarizedExperiment" | |
# attr(,"package") | |
# [1] "SummarizedExperiment" | |
``` | |
Remove samples with NAs annotation | |
```{r} | |
data <- data[,!is.na(data$paper_IDH.status)] | |
dim(data) | |
# [1] 60660 153 | |
data | |
# class: RangedSummarizedExperiment | |
# dim: 60660 153 | |
# metadata(1): data_release | |
# assays(6): unstranded stranded_first ... fpkm_unstrand fpkm_uq_unstrand | |
# rownames(60660): ENSG00000000003.15 ENSG00000000005.6 ... | |
# ENSG00000288674.1 ENSG00000288675.1 | |
# rowData names(10): source type ... hgnc_id havana_gene | |
# colnames(153): TCGA-06-5410-01A-01R-1849-01 TCGA-19-5960-01A-11R-1850-01 | |
# ... TCGA-06-0187-01A-01R-1849-01 TCGA-27-2519-01A-01R-1850-01 | |
# colData names(109): barcode patient ... | |
# paper_Telomere.length.estimate.in.blood.normal..Kb. | |
# paper_Telomere.length.estimate.in.tumor..Kb. | |
colData(data) |> dim() | |
# [1] 153 109 | |
table(colData(data)$paper_IDH.status) | |
# Mutant WT | |
# 11 142 | |
assay(data)[1:5, 1:2] | |
# TCGA-06-5410-01A-01R-1849-01 TCGA-19-5960-01A-11R-1850-01 | |
# ENSG00000000003.15 2203 6187 | |
# ENSG00000000005.6 3 20 | |
# ENSG00000000419.13 1509 1608 | |
# ENSG00000000457.14 478 835 | |
# ENSG00000000460.17 357 594 | |
``` | |
Perform DE analysis | |
```{r DESeq2} | |
library(DESeq2) | |
ddsSE <- DESeqDataSet(data, design = ~ paper_IDH.status) | |
keep <- rowSums(counts(ddsSE)) >= 10 | |
ddsSE <- ddsSE[keep,] | |
ddsSE <- DESeq(ddsSE) | |
# estimating size factors | |
# estimating dispersions | |
# gene-wise dispersion estimates | |
# mean-dispersion relationship | |
# final dispersion estimates | |
# fitting model and testing | |
# -- replacing outliers and refitting for 2134 genes | |
# -- DESeq argument 'minReplicatesForReplace' = 7 | |
# -- original counts are preserved in counts(dds) | |
# estimating dispersions | |
# fitting model and testing | |
resultsNames(ddsSE) | |
# 1] "Intercept" "paper_IDH.status_WT_vs_Mutant" | |
res <- results(ddsSE, name = "paper_IDH.status_WT_vs_Mutant") | |
dea <- as.data.frame(res) | |
summary(res) | |
# out of 47965 with nonzero total read count | |
# adjusted p-value < 0.1 | |
# LFC > 0 (up) : 3464, 7.2% | |
# LFC < 0 (down) : 2821, 5.9% | |
# outliers [1] : 0, 0% | |
# low counts [2] : 13020, 27% | |
# (mean count < 1) | |
# [1] see 'cooksCutoff' argument of ?results | |
# [2] see 'independentFiltering' argument of ?results | |
dim(dea) | |
# [1] 47966 6 | |
dea[1:3, ] |> round(3) | |
# baseMean log2FoldChange lfcSE stat pvalue padj | |
# ENSG00000000003.15 4224.253 0.327 0.214 1.531 0.126 0.342 | |
# ENSG00000000005.6 17.369 1.293 0.530 2.437 0.015 0.088 | |
# ENSG00000000419.13 1246.046 0.043 0.140 0.307 0.759 0.888 | |
``` | |
[DeSeq2 results converting ENSG IDs to Gene Symbols, more than one ENSG ID per Gene symbol](https://support.bioconductor.org/p/105430/) | |
```{r GeneSymbol} | |
if (FALSE) { | |
# NOT WORKING | |
# https://stackoverflow.com/a/28544212 | |
library('biomaRt') | |
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) | |
genes <- c("ENSG00000260727", "ENSG00000277521", "ENSG00000116514") | |
G_list <- getBM(filters = "ensembl_gene_id", | |
attributes = c("ensembl_gene_id", "external_gene_name"), | |
values = genes, | |
mart = mart) | |
} | |
library(org.Hs.eg.db) | |
library(AnnotationDbi) | |
# mapIds(org.Hs.eg.db, | |
# keys=genes, | |
# column="SYMBOL", keytype="ENSEMBL", multiVals="first") | |
dea$symbol <- mapIds(org.Hs.eg.db, | |
keys=substr(rownames(dea), 1, 15), | |
column="SYMBOL", keytype="ENSEMBL", multiVals="first") | |
``` | |
```{r ReduceDuplicated} | |
length(unique(dea$symbol)) | |
# 29322 | |
# keep the one with the largest overall expression value. | |
o <- order(dea$baseMean, decreasing=TRUE) | |
dea2 <- dea[o,] | |
dea3 <- dea2[!duplicated(dea2$symbol),] | |
dim(dea3) | |
# 29322 7 | |
sum(is.na(dea3$symbol)) | |
# [1] 1 | |
dea4 <- dea3[! is.na(dea3$symbol), ] | |
dim(dea4) | |
# 29321 7 | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment