Skip to content

Instantly share code, notes, and snippets.

@oganm
Last active August 10, 2020 12:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save oganm/4dc620d75a893608b4862e11a31c0050 to your computer and use it in GitHub Desktop.
Save oganm/4dc620d75a893608b4862e11a31c0050 to your computer and use it in GitHub Desktop.
yeast download for jesse
library(gemmaAPI)
# NOTE(review): `%>%` is used throughout this script but only gemmaAPI is
# attached above -- confirm that gemmaAPI makes the magrittr pipe available.

# Identify all yeast datasets and extract the "id" field of each one.
yeastStudies <- taxonInfo("yeast", request = "datasets", limit = 0)
studyIDs <- yeastStudies %>% purrr::map_chr("id")

# Get metadata for the yeast studies: one compileMetadata() result per study.
yeastMetadata <- studyIDs %>% lapply(compileMetadata, outputType = "list")

# Distribution of the `geeq.qualityScore` values found in each study's
# experimentData. hist() is called directly instead of through the pipe:
# piping makes magrittr evaluate `hist(.)`, which labels the plot
# "Histogram of ." rather than "Histogram of quality".
quality <- yeastMetadata %>%
  purrr::map("experimentData") %>%
  purrr::map_dbl("geeq.qualityScore")
hist(quality)
# Count datasets per platform. Note that the most common platform (GPL2529,
# 58 experiments, 4618 annotated genes, 10928 probes) and the second most
# common (GPL90, 55 experiments, 5549 annotated genes, 9335 probes) differ by
# only 3 experiments. I don't have much experience with yeast platforms myself,
# so I can't comment on their relative quality. This script gets the data for
# both of them, saved to different directories.
platforms <- yeastMetadata %>%
  purrr::map("experimentData") %>%
  purrr::map_chr("platformName")

# Names of the (up to) two platforms with the most datasets. head() replaces
# `[1:2]` so that having fewer than two platforms does not pad the result with
# NA, which would otherwise flow into the filtering and download steps.
popularPlatforms <- platforms %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  names() %>%
  utils::head(n = 2)
# Restrict the metadata to the datasets run on one of the popular platforms.
yeastMetadata <- yeastMetadata[platforms %in% popularPlatforms]

# Platform name of each remaining dataset, in the same order as yeastMetadata.
platforms <- purrr::map_chr(
  purrr::map(yeastMetadata, "experimentData"),
  "platformName"
)

# Dataset ID of each remaining dataset.
studyIDs <- purrr::map_chr(
  purrr::map(yeastMetadata, "experimentData"),
  "datasetID"
)

# Differential listing of every remaining dataset and, per dataset, the names
# of the returned entries.
# NOTE(review): nothing later in the visible script reads `diffs` or this
# top-level `differentials`; the per-platform download step issues the same
# requests again -- confirm whether these two statements are still needed.
diffs <- lapply(
  studyIDs,
  datasetInfo,
  request = "differential", offset = 0, limit = 0
)
differentials <- purrr::map(diffs, names)
# Download every differential of every dataset, one output directory per
# platform, in case you want to analyze the platforms separately or ignore one
# of them. Files are written to "<platform>/<study>_<differential id>".
#
# purrr::walk() is used because the function is called only for its side
# effects; the previous top-level lapply() also auto-printed its nested list
# of results once the downloads finished.
purrr::walk(popularPlatforms, function(platform) {
  # These two studies returned errors, so nothing is downloaded for them.
  failingStudies <- c("GSE17716", "GSE40399")

  # Dataset IDs that belong to this platform.
  platformStudyIDs <- yeastMetadata[platforms == platform] %>%
    purrr::map("experimentData") %>%
    purrr::map_chr("datasetID")

  # Get the differential IDs of each dataset. The list is iterated by name
  # below, so it relies on the names carried over from yeastMetadata.
  # NOTE(review): presumably those names are the study accessions (the skip
  # list above is matched against them) -- confirm.
  platformDifferentials <- platformStudyIDs %>%
    lapply(datasetInfo, request = "differential", offset = 0, limit = 0) %>%
    purrr::map(names)

  dir.create(platform, showWarnings = FALSE)

  for (study in names(platformDifferentials)) {
    # Checked once per study instead of once per differential.
    if (study %in% failingStudies) {
      next
    }

    for (id in platformDifferentials[[study]]) {
      print(study)
      target <- file.path(platform, paste0(study, "_", id))

      # Skip files that already exist so an interrupted run can be resumed
      # without downloading everything again; this check is safe to remove.
      if (file.exists(target)) {
        print("exists already. skipping")
      } else {
        datasetInfo(
          study,
          request = "degs",
          differential = id,
          file = target,
          return = FALSE
        )
      }
    }
  }
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment