Skip to content

Instantly share code, notes, and snippets.

@tiagochst
Last active November 9, 2023 12:56
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tiagochst/a701bad3fa3800ade7063760755e0aad to your computer and use it in GitHub Desktop.
Save tiagochst/a701bad3fa3800ade7063760755e0aad to your computer and use it in GitHub Desktop.
Using TCGAbiolinks with GDC (still in development)
# ------------------------------------------------------------------
# Updating TCGAbiolinks to work with GDC data
# --------------------------------------------------------------------
# Install the latest version from GitHub (this is a development version)
devtools::install_github("BioinformaticsFMRP/TCGAbiolinks")
library(TCGAbiolinks)
####################### Working with harmonized data ######################
# Data.category: clinical and biospecimen
############################################################################
# Clinical information
# https://gdc.nci.nih.gov/about-data/data-harmonization-and-generation/clinical-data-harmonization
# Clinical table for TCGA-ACC (save.csv = TRUE requests a CSV copy as well).
clin <- GDCquery_clinic("TCGA-ACC", type = "clinical", save.csv = TRUE)
# Biospecimen table, stored under its own name so that it no longer
# overwrites the clinical table held in `clin`.
biospecimen <- GDCquery_clinic(
  "TCGA-ACC",
  type = "biospecimen",
  save.csv = TRUE
)
#-----------------------------------------------------------------------------
# Data.category: MAF files
#-----------------------------------------------------------------------------
# Mutation annotation (MAF) data for the ACC cohort.
mut <- GDCquery_Maf(tumor = "ACC")

# Clinical table, reduced to the columns passed as oncoprint annotation.
clin <- GDCquery_clinic("TCGA-ACC", "clinical")
clin <- clin[, c(
  "bcr_patient_barcode", "disease", "gender",
  "tumor_stage", "race", "vital_status"
)]

# Oncoprint for the first 20 entries of the MAF's Hugo_Symbol column.
TCGAvisualize_oncoprint(
  mut = mut,
  genes = mut$Hugo_Symbol[1:20],
  filename = "onco.pdf",
  annotation = clin,
  color = c(
    "background" = "#CCCCCC",
    "DEL" = "purple",
    "INS" = "yellow",
    "SNP" = "brown"
  ),
  rows.font.size = 10,
  heatmap.legend.side = "right",
  dist.col = 0,
  label.font.size = 10
)
#-----------------------------------------------------------------------------
# Data.category: Copy number variation
#-----------------------------------------------------------------------------
# Copy number segments for two explicitly selected TCGA-ACC barcodes.
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Copy Number Variation",
  data.type = "Copy Number Segment",
  barcode = c(
    "TCGA-OR-A5KU-01A-11D-A29H-01",
    "TCGA-OR-A5JK-01A-11D-A29H-01"
  )
)
GDCdownload(query)
data <- GDCprepare(query)

# Masked copy number segments filtered by sample type.
# The matched barcodes can be inspected with query$results[[1]]$cases.
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Copy Number Variation",
  data.type = "Masked Copy Number Segment",
  sample.type = "Primary solid Tumor"
)
GDCdownload(query)
data <- GDCprepare(query)
#-----------------------------------------------------------------------------
# Data.category: Transcriptome Profiling
#-----------------------------------------------------------------------------
# Run the same gene expression query once per workflow type.
# `data` is overwritten on every pass, so only the last result is kept.
workflow.type <- c("HTSeq - Counts", "HTSeq - FPKM", "HTSeq - FPKM-UQ")
for (i in workflow.type) {
  print(i)
  query <- GDCquery(
    project = "TARGET-AML",
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification",
    workflow.type = i,
    barcode = c(
      "TARGET-20-PADZCG-04A-01R",
      "TARGET-20-PARJCR-09A-01R"
    )
  )
  GDCdownload(query)
  data <- GDCprepare(query)
}
# Alternative that also requests isoform-level data (currently disabled):
# data.type <- c("miRNA Expression Quantification",
#                "Isoform Expression Quantification")
data.type <- c("miRNA Expression Quantification")

# One query per data type, printing the first rows of each prepared result.
for (i in data.type) {
  print(i)
  query <- GDCquery(
    project = "TARGET-AML",
    data.category = "Transcriptome Profiling",
    data.type = i,
    workflow.type = "BCGSC miRNA Profiling",
    barcode = c(
      "TARGET-20-PARUDL-03A-01R",
      "TARGET-20-PASRRB-03A-01R"
    )
  )
  GDCdownload(query)
  data <- GDCprepare(query)
  print(head(data))
}
####################### Working with Legacy data ###########################
# Data.category: Copy number variation
############################################################################
# Legacy archive, file.type = "nocnv_hg19.seg".
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Copy number variation",
  legacy = TRUE,
  file.type = "nocnv_hg19.seg",
  barcode = c(
    "TCGA-OR-A5LR-01A-11D-A29H-01",
    "TCGA-OR-A5LJ-10A-01D-A29K-01"
  )
)
GDCdownload(query)
z <- GDCprepare(query)

# Legacy archive, file.type = "nocnv_hg18.seg".
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Copy number variation",
  legacy = TRUE,
  file.type = "nocnv_hg18.seg",
  barcode = c(
    "TCGA-OR-A5LR-01A-11D-A29H-01",
    "TCGA-OR-A5LJ-10A-01D-A29K-01"
  )
)
GDCdownload(query)
z <- GDCprepare(query)

# Legacy archive, file.type = "hg19.seg".
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Copy number variation",
  legacy = TRUE,
  file.type = "hg19.seg",
  barcode = c(
    "TCGA-OR-A5LR-01A-11D-A29H-01",
    "TCGA-OR-A5LJ-10A-01D-A29K-01"
  )
)
GDCdownload(query)
z <- GDCprepare(query)

# Legacy TCGA-LGG data selected by platform instead of file type.
query <- GDCquery(
  project = "TCGA-LGG",
  data.category = "Copy number variation",
  platform = "Illumina HiSeq",
  legacy = TRUE,
  barcode = c(
    "TCGA-HT-7476-10A-01D-2022-02",
    "TCGA-FG-6689-01A-11D-1891-02"
  )
)
GDCdownload(query)
z <- GDCprepare(query)

# Legacy archive, file.type = "hg18.seg".
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Copy number variation",
  legacy = TRUE,
  file.type = "hg18.seg",
  barcode = c(
    "TCGA-OR-A5LR-01A-11D-A29H-01",
    "TCGA-OR-A5LJ-10A-01D-A29K-01"
  )
)
GDCdownload(query)
z <- GDCprepare(query)
####################### Working with Legacy data ###########################
# Data.category: DNA methylation & Protein expression
############################################################################
# Query, download and prepare legacy data for at most the first two cases of
# a project / data category / platform combination. Meant as a quick check
# that the query -> download -> prepare chain works for that combination.
#
# Arguments:
#   project        project identifier passed to GDCquery, e.g. "TCGA-GBM"
#   data.category  data category passed to GDCquery, e.g. "DNA methylation"
#   platform       platform name passed to GDCquery
#
# Returns the object produced by GDCprepare() for the selected cases.
# Stops with an error when the initial query lists no cases.
legacyPipeline <- function(project, data.category, platform) {
  # Unfiltered query, used only to discover which cases are available.
  query <- GDCquery(
    project = project,
    data.category = data.category,
    platform = platform,
    legacy = TRUE
  )

  # head() returns fewer than two elements when fewer exist, whereas
  # indexing with [1:2] would pad the barcode vector with NA.
  cases <- utils::head(query$results[[1]]$cases, 2)
  if (length(cases) == 0) {
    stop(
      "No cases found for project '", project,
      "', data.category '", data.category,
      "', platform '", platform, "'",
      call. = FALSE
    )
  }

  # Same query again, now restricted to the selected barcodes.
  query <- GDCquery(
    project = project,
    data.category = data.category,
    platform = platform,
    legacy = TRUE,
    barcode = cases
  )
  GDCdownload(query)
  GDCprepare(query)
}
# DNA methylation: one call per platform. The "OMA002 CPI" platform used to
# be listed twice, which repeated the same query and download.
data <- legacyPipeline(
  "TCGA-GBM", "DNA methylation", "Illumina Human Methylation 27"
)
data <- legacyPipeline(
  "TCGA-GBM", "DNA methylation", "Illumina Human Methylation 450"
)
data <- legacyPipeline(
  "TCGA-GBM", "DNA methylation", "Illumina DNA Methylation OMA003 CPI"
)
data <- legacyPipeline(
  "TCGA-GBM", "DNA methylation", "Illumina DNA Methylation OMA002 CPI"
)
# Protein expression
data <- legacyPipeline("TCGA-GBM", "Protein expression", "MDA_RPPA_Core")
@behnazhoseyni
Copy link

Hello. I need the slide images for a TCGA project from the GDC.
I can get them through the GDC portal by setting Data Type = Slide Image and Experimental Strategy = Diagnostic Slide,
but I need to get this data through the API from R.
I tried:
GDCquery(project = "TCGA-OV",data.category = "Biospecimen",
data.type = 'Slide Image')

but i get this :
|sort(harmonized.data.type) |
|:---------------------------------|
|Biospecimen Supplement |
|Clinical Supplement |
|Copy Number Segment |
|Gene Expression Quantification |
|Gene Level Copy Number Scores |
|Isoform Expression Quantification |
|Masked Copy Number Segment |
|Masked Somatic Mutation |
|miRNA Expression Quantification |
Error in checkDataTypeInput(legacy = legacy, data.type = data.type) :
Please set a data.type argument from the column harmonized.data.type above

please help me.

@tiagochst
Copy link
Author

hello . i need slide image for TCGA project from gdc.
i can get this by gdc portal and set data type=slide image ,Experimental Strategy=Diagnostic Slide.
but i need get this data with api method and R.
i try with :
GDCquery(project = "TCGA-OV",data.category = "Biospecimen",
data.type = 'Slide Image')

but i get this :

sort(harmonized.data.type)
Biospecimen Supplement
Clinical Supplement
Copy Number Segment
Gene Expression Quantification
Gene Level Copy Number Scores
Isoform Expression Quantification
Masked Copy Number Segment
Masked Somatic Mutation
miRNA Expression Quantification
Error in checkDataTypeInput(legacy = legacy, data.type = data.type) :
Please set a data.type argument from the column harmonized.data.type above

please help me.

I just updated the data.type check. Could you please update the package from GitHub?
You can update it with:


# Install the development version from GitHub with
# R_REMOTES_NO_ERRORS_FROM_WARNINGS set to "true" for the duration of the call.
withr::with_envvar(
  c(R_REMOTES_NO_ERRORS_FROM_WARNINGS = "true"),
  remotes::install_github("BioinformaticsFMRP/TCGAbiolinks")
)

The code below should work:

# Query the slide images of TCGA-OV in the Biospecimen category.
query <- GDCquery(
  project = "TCGA-OV",
  data.category = "Biospecimen",
  data.type = "Slide Image"
)
# Download the matched files, two files per chunk.
GDCdownload(query, files.per.chunk = 2)

@behnazhoseyni
Copy link

thank you.
I updated the package, but it still doesn't work and I get the same error.

@tiagochst
Copy link
Author

tiagochst commented Aug 29, 2019 via email

@behnazhoseyni
Copy link

yes.

@behnazhoseyni
Copy link

behnazhoseyni commented Aug 29, 2019

please check this page
http://www.bioconductor.org/packages/devel/bioc/vignettes/TCGAbiolinks/inst/doc/download_prepare.html
I found this sentence about 'Harmonized data':
"This function is still under development, it is not working for all cases."
Is that the reason the code above doesn't work?

@tiagochst
Copy link
Author

It is working here:

Screenshot from 2019-08-29 15-28-25

@behnazhoseyni
Copy link

Oh, I'm amazed.
I will try to find out why it isn't working for me.
thank you so much.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment