Skip to content

Instantly share code, notes, and snippets.

@crazyhottommy
Forked from tiagochst/get_all_clin.R
Last active April 21, 2017 19:05
Show Gist options
  • Save crazyhottommy/fd7731dfd7ce0a7f13b7b80bb6c79627 to your computer and use it in GitHub Desktop.
Save crazyhottommy/fd7731dfd7ce0a7f13b7b80bb6c79627 to your computer and use it in GitHub Desktop.
# This code will get all clinical indexed data from TCGA
library(TCGAbiolinks)
library(data.table)
# Build one table with the indexed clinical data of every TCGA project:
#   1. list all GDC project ids and keep those containing "TCGA", sorted;
#   2. call GDCquery_clinic() once per project (text progress bar);
#   3. stack the per-project tables and write them to all_clin_indexed.csv.
# NOTE(review): `%>%` is used here but no library() call in this file names
# the package that provides it -- confirm it is attached (e.g. magrittr).
clinical <- TCGAbiolinks:::getGDCprojects()$project_id %>%
  regexPipes::grep("TCGA", value = TRUE) %>%
  sort %>%
  plyr::alply(1, GDCquery_clinic, .progress = "text") %>%
  # fill = TRUE matches columns by name and pads the ones a project lacks
  # with NA, consistent with the other rbindlist() calls in this file.
  rbindlist(fill = TRUE)
# The output file is passed positionally: readr renamed this argument from
# `path` to `file`, and the positional form works with both spellings.
readr::write_csv(clinical, "all_clin_indexed.csv")
# This code will get all clinical XML data from TCGA
# Download the clinical XML data of one project and flatten it into a table.
#
# Arguments:
#   proj       Project identifier handed to GDCquery() (the callers in this
#              file pass ids containing "TCGA").
#   max_tries  Maximum number of download/parse attempts. The default, Inf,
#              keeps the historical behaviour of retrying until an attempt
#              succeeds; pass a finite number to fail instead of looping
#              forever on a permanent error.
#
# Returns the "patient" table merged (full outer join on bcr_patient_barcode)
# with each of the "admin", "radiation", "follow_up", "drug" and
# "new_tumor_event" tables that GDCprepare_clinic() returns as non-NULL.
# Side effect: the merged table is written to "<proj>_clinical_from_XML.csv".
# Signals an error once max_tries attempts have failed.
getclinical <- function(proj, max_tries = Inf) {
  message(proj)
  attempt <- 0
  while (attempt < max_tries) {
    attempt <- attempt + 1
    result <- tryCatch({
      query <- GDCquery(project = proj, data.category = "Clinical")
      GDCdownload(query)
      clinical <- GDCprepare_clinic(query, clinical.info = "patient")
      for (info in c("admin", "radiation", "follow_up", "drug", "new_tumor_event")) {
        message(info)
        aux <- GDCprepare_clinic(query, clinical.info = info)
        if (is.null(aux)) next
        # Add the suffix manually to columns that already exist in `clinical`.
        # The positions are computed on the full column vector so that they
        # line up with colnames(aux) wherever the barcode column sits; the
        # join key itself is never renamed.
        aux_cols <- colnames(aux)
        replicated <- which(
          !grepl("bcr_patient_barcode", aux_cols) &
            aux_cols %in% colnames(clinical)
        )
        colnames(aux)[replicated] <- paste0(aux_cols[replicated], ".", info)
        clinical <- merge(clinical, aux, by = "bcr_patient_barcode", all = TRUE)
      }
      # Save the clinical data into a csv file (file argument is positional
      # so it works whether readr calls it `path` or `file`).
      readr::write_csv(clinical, paste0(proj, "_clinical_from_XML.csv"))
      clinical
    }, error = function(e) {
      message(paste0("Error clinical: ", proj))
      # Report the underlying cause so repeated failures can be diagnosed.
      message(conditionMessage(e))
      NULL
    })
    if (!is.null(result)) {
      return(result)
    }
  }
  stop(
    "Could not retrieve clinical data for ", proj, " after ", max_tries,
    " attempt(s)",
    call. = FALSE
  )
}
# Run getclinical() for every project id containing "TCGA" (sorted, with a
# text progress bar), stack the per-project tables by column name, convert
# the result to a plain data.frame and write it to all_clin_XML.csv.
clinical <- TCGAbiolinks:::getGDCprojects()$project_id %>%
  regexPipes::grep("TCGA", value = TRUE) %>%
  sort %>%
  plyr::alply(1, getclinical, .progress = "text") %>%
  rbindlist(fill = TRUE) %>%
  setDF
# The output file is passed positionally: readr renamed this argument from
# `path` to `file`, and the positional form works with both spellings.
readr::write_csv(clinical, "all_clin_XML.csv")
# Get all batch numbers for each patient
library(TCGAbiolinks)
# Download the biospecimen XML data of one project and return its "admin"
# table as produced by GDCprepare_clinic().
#
# Arguments:
#   proj       Project identifier handed to GDCquery() (the callers in this
#              file pass ids containing "TCGA").
#   max_tries  Maximum number of download/parse attempts. The default, Inf,
#              keeps the historical behaviour of retrying until an attempt
#              succeeds; pass a finite number to fail instead of looping
#              forever on a permanent error.
#
# Side effect: the table is written to "<proj>_batch_from_XML.csv".
# Signals an error once max_tries attempts have failed.
getBatch <- function(proj, max_tries = Inf) {
  message(proj)
  attempt <- 0
  while (attempt < max_tries) {
    attempt <- attempt + 1
    result <- tryCatch({
      query <- GDCquery(project = proj, data.category = "Biospecimen")
      GDCdownload(query)
      batch <- GDCprepare_clinic(query, clinical.info = "admin")
      # Save the batch data into a csv file (file argument is positional so
      # it works whether readr calls it `path` or `file`).
      readr::write_csv(batch, paste0(proj, "_batch_from_XML.csv"))
      batch
    }, error = function(e) {
      message(paste0("Error biospecimen: ", proj))
      # Report the underlying cause so repeated failures can be diagnosed.
      message(conditionMessage(e))
      NULL
    })
    if (!is.null(result)) {
      return(result)
    }
  }
  stop(
    "Could not retrieve biospecimen data for ", proj, " after ", max_tries,
    " attempt(s)",
    call. = FALSE
  )
}
# Run getBatch() for every project id containing "TCGA" (sorted, with a text
# progress bar), stack the per-project tables by column name, convert the
# result to a plain data.frame and write it to biospecimen_from_XML.csv.
biospecimen <- TCGAbiolinks:::getGDCprojects()$project_id %>%
  regexPipes::grep("TCGA", value = TRUE) %>%
  sort %>%
  plyr::alply(1, getBatch, .progress = "text") %>%
  rbindlist(fill = TRUE) %>%
  setDF
# The output file is passed positionally: readr renamed this argument from
# `path` to `file`, and the positional form works with both spellings.
readr::write_csv(biospecimen, "biospecimen_from_XML.csv")

The following script is taken from this Bioconductor support post: https://support.bioconductor.org/p/89315/

library('TCGAbiolinks')
library('plyr')
library('devtools')
## Every project id reported by GDC, narrowed to those starting with "TCGA"
projects <- TCGAbiolinks:::getGDCprojects()$project_id
projects <- projects[grep("^TCGA", projects, perl = TRUE)]

## One entry per project, named after the project: the prepared "patient"
## clinical table, or NULL when querying/downloading/preparing failed
clin <- setNames(
    lapply(projects, function(p) {
        message(paste(Sys.time(), "processing project", p))
        tryCatch(
            {
                query <- GDCquery(project = p, data.category = "Clinical")
                GDCdownload(query)
                GDCprepare_clinic(query, clinical.info = "patient")
            },
            error = function(e) {
                message(paste0("Error clinical: ", p))
                NULL
            }
        )
    }),
    projects
)

## Stack the per-project tables into a single data frame
clin_all <- rbind.fill(clin)

## Empty strings stand for missing values: turn them into NA, column by column
for (col_idx in seq_along(clin_all)) {
    blank_rows <- which(clin_all[[col_idx]] == "")
    if (length(blank_rows) == 0) next
    clin_all[blank_rows, col_idx] <- NA
}

## Keep both an R binary copy and a tab-separated text copy
save(clin_all, file = "clin_all.Rdata")

write.table(
    clin_all,
    file = "clin_all.tsv",
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
)

## Reproducibility info
## Time at which the script reached this point
Sys.time()
## Widen the console output to 120 columns before printing the session details
options(width = 120)
## session_info() is not defined in this file; presumably it comes from the
## devtools package attached above -- confirm
session_info()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment