Last active
September 4, 2023 15:42
Star
You must be signed in to star a gist
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "02_bioc_curated_TCGA_data" | |
author: "Ming Tang" | |
date: "November 9, 2017" | |
output: html_document | |
--- | |
```{r} | |
devtools::install_github(repo = "BioinformaticsFMRP/TCGAbiolinks") | |
library(here) | |
library(TCGAbiolinks) | |
library(SummarizedExperiment) | |
library(tidyverse) | |
``` | |
### Download RNAseq data | |
```{r} | |
setwd("data/rnaseq_hg38_htseq") | |
## only hg38 has FPKM upper-quantile normalized and raw counts | |
SKCM_rna_query <- GDCquery(project = "TCGA-SKCM", | |
data.category = "Transcriptome Profiling", | |
data.type = "Gene Expression Quantification", | |
workflow.type = "HTSeq - Counts") | |
# hg19 is the RSEM results and the estimated counts | |
# SKCM_rna_query <- GDCquery(project = "TCGA-SKCM", | |
# data.category = "Gene expression", | |
# data.type = "Gene expression quantification", | |
# platform = "Illumina HiSeq", | |
# file.type = "results", | |
# experimental.strategy = "RNA-Seq", | |
# legacy = TRUE) | |
# Download a list of barcodes with platform IlluminaHiSeq_RNASeqV2 | |
GDCdownload(SKCM_rna_query, method = "client") | |
# Prepare expression matrix with geneID in the rows and samples (barcode) in the columns | |
# rsem.genes.results as values | |
SKCMRnaseqSE <- GDCprepare(SKCM_rna_query) | |
rowData(SKCMRnaseqSE) | |
colData(SKCMRnaseqSE) | |
metadata(SKCMRnaseqSE) | |
## raw counts | |
SKCMMatrix <- assay(SKCMRnaseqSE) | |
## save the raw counts. | |
save(SKCMRnaseqSE, file = "SKCMRnaseqSE.rda") | |
load("SKCMRnaseqSE.rda") | |
``` | |
#### Convert raw counts to TPM for each sample | |
the EnsDb is v75, the raw counts are from hg38 ref genome. The gene id should be the same. | |
sanity check! | |
```{r} | |
#source("https://bioconductor.org/biocLite.R") | |
#biocLite("EnsDb.Hsapiens.v75") | |
#create a tx2gene.txt table | |
library(EnsDb.Hsapiens.v75) | |
edb <- EnsDb.Hsapiens.v75 | |
genes_ensemble <- genes(edb) | |
gene_length<- as.data.frame(genes_ensemble) %>% dplyr::select(gene_id, gene_name, width) | |
### get rid of the 0 counts genes first. | |
SKCMMatrix_clean<- SKCMMatrix[rowSums(SKCMMatrix) > 5, ] | |
gene_length_in_mat<- left_join(data.frame(gene_id = rownames(SKCMMatrix_clean)), gene_length) %>% filter(!is.na(width)) | |
SKCMMatrix_sub<- SKCMMatrix_clean[rownames(SKCMMatrix_clean) %in% gene_length_in_mat$gene_id, ] | |
## now, just use gene_length instead of effective length. effective length = gene length - fragment_length + 1 | |
## we do not have fragment length for each data set (neeed to infer from the fastq files) | |
all.equal(rownames(SKCMMatrix_sub), gene_length_in_mat$gene_id) | |
countToTpm <- function(counts, effLen) | |
{ | |
rate <- log(counts + 1) - log(effLen) | |
denom <- log(sum(exp(rate))) | |
exp(rate - denom + log(1e6)) | |
} | |
SKCM_TPM<- apply(SKCMMatrix_sub,2, countToTpm, effLen = gene_length_in_mat$width) | |
head(SKCM_TPM) | |
hist(SKCM_TPM) | |
``` | |
### subset only our samples | |
```{r} | |
library(readxl) | |
library(stringr) | |
SKCM_Terrance_meta<- read_xlsx("data/meta_files/TCGA_SKCM_SubtypeInformation_Terrence_10-31-17.infomat.xlsx") | |
View(SKCM_Terrance_meta) | |
SKCM_Terrance_meta$Name | |
## make sure the first 15 letters are unique | |
colnames(SKCM_TPM) %>% length() | |
colnames(SKCM_TPM) %>% str_sub(1,15) %>% unique() %>% length() | |
colnames(SKCM_TPM)<- colnames(SKCM_TPM) %>% str_sub(1,15) | |
SKCM_TPM_sub<- SKCM_TPM[, colnames(SKCM_TPM) %in% SKCM_Terrance_meta$Name] | |
write.table(SKCM_TPM, "data/rnaseq_hg38_htseq/SKCM_TPM.txt", sep = "\t", col.names = T, | |
row.names = T, quote = F) | |
write.table(SKCM_TPM_sub, "data/rnaseq_hg38_htseq/SKCM_TPM_sub.txt", sep = "\t", col.names = T, | |
row.names = T, quote = F) | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment