Created
September 13, 2017 20:08
-
-
Save smrgit/9872726e76323b9d5de4e8e9fa54a2d4 to your computer and use it in GitHub Desktop.
GDC metadata exploration example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH | |
-- | |
-- this initial table collects the case barcodes for all TCGA LAML and | |
-- TARGET AML cases from the two "Clinical" tables | |
-- Result: this sub-query returns a table with 1193 rows | |
t1 AS ( | |
SELECT | |
program_name, | |
case_barcode | |
FROM | |
`isb-cgc.TCGA_bioclin_v0.Clinical` | |
WHERE | |
project_short_name="TCGA-LAML" | |
UNION ALL | |
SELECT | |
program_name, | |
case_barcode | |
FROM | |
`isb-cgc.TARGET_bioclin_v0.Clinical` | |
WHERE | |
project_short_name="TARGET-AML" ), | |
-- | |
-- next, we use the GDC "cases" metadata table in order to get the GDC | |
-- case identifiers assigned to the case barcodes (aka 'submitter ids') | |
-- NB: this could also be done with a JOIN, but instead we use the | |
-- "WHERE ... IN ... " operator just because it can be a handy tool in | |
-- many situations | |
-- Result: this sub-query returns a table with 1135 rows, so 58 cases | |
-- from the "Clinical" table(s) were not found in the GDC metadata | |
t2 AS ( | |
SELECT | |
case_id, | |
submitter_id | |
FROM | |
`isb-cgc.GDC_metadata.rel8_caseData` | |
WHERE | |
submitter_id IN ( | |
SELECT | |
case_barcode | |
FROM | |
t1) ), | |
-- | |
-- next, we'll join the previous two tables | |
-- Result: this operation returns a table with 1135 rows, each containing | |
-- the program_name (eg "TCGA"), the case_barcode (eg "TCGA-AB-2869") | |
-- and the case_id (eg "e44e2ac3-4a83-40b0-b266-a36626ffd6ae") | |
j1 AS ( | |
SELECT | |
t1.program_name, | |
t1.case_barcode, | |
t2.case_id | |
FROM | |
t1 | |
JOIN | |
t2 | |
ON | |
t1.case_barcode=t2.submitter_id), | |
-- | |
-- next, we want to use the case_ids from above to find information | |
-- about available BAM files -- from the 'current' GDC archive, not | |
-- the legacy archive | |
-- Result: this step returns a table with 1150 rows -- some cases have | |
-- multiple associated BAM files, some have none | |
t3 AS ( | |
SELECT | |
file_id, | |
cases__project__project_id AS project_id, | |
associated_entities__case_id, | |
acl, | |
experimental_strategy, | |
analysis__workflow_type, | |
data_category, | |
data_format, | |
data_type | |
FROM | |
`isb-cgc.GDC_metadata.rel8_fileData_current` | |
WHERE | |
data_format="BAM" | |
AND associated_entities__case_id IN ( | |
SELECT | |
case_id | |
FROM | |
j1 ) ), | |
-- | |
-- and finally we will JOIN this list of BAM files with a table | |
-- containing a mapping from the GDC file_id to the ISB-CGC | |
-- bucket path in GCS (Google Cloud Storage) | |
-- Result: the table produced by this step also has 1150 rows, | |
-- indicating that every file previously identified is in fact | |
-- available in an ISB-CGC bucket | |
j2 AS ( | |
SELECT | |
* | |
FROM | |
t3 a | |
JOIN | |
`isb-cgc.GDC_metadata.rel8_GDCfileID_to_GCSurl` b | |
ON | |
a.file_id=b.file_gdc_id ) | |
-- | |
-- Perhaps at this point, the user might want to know more about these BAM files, | |
-- and might, for example (as shown below), count up the BAM files based on | |
-- information such as the ACL, project id, experimental strategy, and | |
-- analysis workflow type | |
SELECT | |
acl, | |
project_id, | |
experimental_strategy, | |
analysis__workflow_type, | |
COUNT(*) AS n | |
FROM | |
j2 | |
GROUP BY | |
acl, | |
project_id, | |
experimental_strategy, | |
analysis__workflow_type | |
ORDER BY | |
experimental_strategy, | |
analysis__workflow_type, | |
n DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment