Skip to content

Instantly share code, notes, and snippets.

@crazyhottommy
Last active September 4, 2023 15:42
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save crazyhottommy/b6ada3da075f84f037f0c1337147e540 to your computer and use it in GitHub Desktop.
---
title: "02_bioc_curated_TCGA_data"
author: "Ming Tang"
date: "November 9, 2017"
output: html_document
---
```{r}
devtools::install_github(repo = "BioinformaticsFMRP/TCGAbiolinks")
library(here)
library(TCGAbiolinks)
library(SummarizedExperiment)
library(tidyverse)
```
### Download RNAseq data
```{r}
setwd("data/rnaseq_hg38_htseq")
## only hg38 has FPKM upper-quantile normalized and raw counts
SKCM_rna_query <- GDCquery(project = "TCGA-SKCM",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts")
# hg19 is the RSEM results and the estimated counts
# SKCM_rna_query <- GDCquery(project = "TCGA-SKCM",
# data.category = "Gene expression",
# data.type = "Gene expression quantification",
# platform = "Illumina HiSeq",
# file.type = "results",
# experimental.strategy = "RNA-Seq",
# legacy = TRUE)
# Download a list of barcodes with platform IlluminaHiSeq_RNASeqV2
GDCdownload(SKCM_rna_query, method = "client")
# Prepare expression matrix with geneID in the rows and samples (barcode) in the columns
# rsem.genes.results as values
SKCMRnaseqSE <- GDCprepare(SKCM_rna_query)
rowData(SKCMRnaseqSE)
colData(SKCMRnaseqSE)
metadata(SKCMRnaseqSE)
## raw counts
SKCMMatrix <- assay(SKCMRnaseqSE)
## save the raw counts.
save(SKCMRnaseqSE, file = "SKCMRnaseqSE.rda")
load("SKCMRnaseqSE.rda")
```
#### Convert raw counts to TPM for each sample
the EnsDb is v75, the raw counts are from hg38 ref genome. The gene id should be the same.
sanity check!
```{r}
#source("https://bioconductor.org/biocLite.R")
#biocLite("EnsDb.Hsapiens.v75")
#create a tx2gene.txt table
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75
genes_ensemble <- genes(edb)
gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width)
### get rid of the 0 counts genes first.
SKCMMatrix_clean<- SKCMMatrix[rowSums(SKCMMatrix) > 5, ]
gene_length_in_mat<- left_join(data.frame(gene_id = rownames(SKCMMatrix_clean)), gene_length) %>% filter(!is.na(width))
SKCMMatrix_sub<- SKCMMatrix_clean[rownames(SKCMMatrix_clean) %in% gene_length_in_mat$gene_id, ]
## now, just use gene_length instead of effective length. effective length = gene length - fragment_length + 1
## we do not have fragment length for each data set (neeed to infer from the fastq files)
all.equal(rownames(SKCMMatrix_sub), gene_length_in_mat$gene_id)
countToTpm <- function(counts, effLen)
{
rate <- log(counts + 1) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
SKCM_TPM<- apply(SKCMMatrix_sub,2, countToTpm, effLen = gene_length_in_mat$width)
head(SKCM_TPM)
hist(SKCM_TPM)
```
### subset only our samples
```{r}
library(readxl)
library(stringr)
SKCM_Terrance_meta<- read_xlsx("data/meta_files/TCGA_SKCM_SubtypeInformation_Terrence_10-31-17.infomat.xlsx")
View(SKCM_Terrance_meta)
SKCM_Terrance_meta$Name
## make sure the first 15 letters are unique
colnames(SKCM_TPM) %>% length()
colnames(SKCM_TPM) %>% str_sub(1,15) %>% unique() %>% length()
colnames(SKCM_TPM)<- colnames(SKCM_TPM) %>% str_sub(1,15)
SKCM_TPM_sub<- SKCM_TPM[, colnames(SKCM_TPM) %in% SKCM_Terrance_meta$Name]
write.table(SKCM_TPM, "data/rnaseq_hg38_htseq/SKCM_TPM.txt", sep = "\t", col.names = T,
row.names = T, quote = F)
write.table(SKCM_TPM_sub, "data/rnaseq_hg38_htseq/SKCM_TPM_sub.txt", sep = "\t", col.names = T,
row.names = T, quote = F)
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment