Created
September 27, 2017 22:36
-
-
Save smrgit/59cef721aa44aabf6b6564d51d674189 to your computer and use it in GitHub Desktop.
join several different metadata tables together to find normal BAMs in GCS, combined with their CGHub and/or GDC file identifiers (where available)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH | |
-- | |
-- first, we grab information about the ~48650 TCGA DNA bam files that were at CGHub | |
-- (and are aligned to GRCh37/HG19 and were classified as "Live") | |
dbGaPinfo AS ( | |
SELECT | |
SUBSTR(barcode,1,12) AS case_barcode, | |
SUBSTR(barcode,1,16) AS sample_barcode, | |
barcode AS aliquot_barcode, | |
SUBSTR(barcode,14,2) AS sample_type, | |
analyte_type, | |
library_type, | |
platform, | |
assembly, | |
filename, | |
CAST(file_size AS INT64) AS file_size, | |
analysis_id AS CGHub_file_id | |
FROM | |
`isb-cgc.tcga_seq_metadata.CGHub_Manifest_24jun2016` | |
WHERE | |
file_type="bam" | |
AND study LIKE "%TCGA%" | |
AND barcode LIKE "TCGA-%" | |
AND (assembly LIKE "%37%" | |
OR assembly LIKE "%19%") | |
AND state="Live" | |
AND library_type NOT LIKE "%RNA%" | |
AND library_type NOT LIKE "Bisulfite%" ), | |
-- | |
-- next we grab information about the ~44100 TCGA DNA bam files in the | |
-- GDC legacy archive | |
GDCinfo AS ( | |
SELECT | |
SUBSTR(associated_entities__entity_submitter_id,1,12) AS case_barcode, | |
SUBSTR(associated_entities__entity_submitter_id,1,16) AS sample_barcode, | |
associated_entities__entity_submitter_id AS aliquot_barcode, | |
SUBSTR(associated_entities__entity_submitter_id,14,2) AS sample_type, | |
file_name, | |
file_size, | |
experimental_strategy, | |
file_id AS GDC_file_id | |
FROM | |
`isb-cgc.GDC_metadata.rel8_fileData_legacy` | |
WHERE | |
associated_entities__entity_type="aliquot" | |
AND cases__project__program__name="TCGA" | |
AND data_format="BAM" | |
AND data_type="Aligned reads" | |
AND data_category="Raw sequencing data" | |
AND experimental_strategy NOT LIKE "%RNA%" | |
AND experimental_strategy NOT LIKE "%Bisulfite%" ), | |
-- | |
-- then we do a FULL JOIN on the two tables above, using | |
-- the COALESCE function to collapse fields that should be | |
-- the same | |
-- this results in a table with ~55800 rows, where: | |
-- the CGHub_file_id is NULL for ~7155 | |
-- the GDC_file_id is NULL for ~11440 | |
-- neither is NULL for ~37220 | |
j1 AS ( | |
SELECT | |
COALESCE(a.case_barcode, | |
b.case_barcode) AS case_barcode, | |
COALESCE(a.sample_barcode, | |
b.sample_barcode) AS sample_barcode, | |
COALESCE(a.aliquot_barcode, | |
b.aliquot_barcode) AS aliquot_barcode, | |
COALESCE(a.sample_type, | |
b.sample_type) AS sample_type, | |
a.analyte_type, | |
COALESCE(b.experimental_strategy, | |
a.library_type) AS experimental_strategy, | |
a.platform, | |
a.assembly, | |
COALESCE(a.filename, | |
b.file_name) AS filename, | |
COALESCE(a.file_size, | |
b.file_size) AS file_size, | |
a.CGHub_file_id, | |
b.GDC_file_id | |
FROM | |
dbGaPinfo a | |
FULL JOIN | |
GDCinfo b | |
ON | |
a.aliquot_barcode=b.aliquot_barcode | |
AND a.filename=b.file_name | |
AND a.file_size=b.file_size ), | |
-- | |
-- next we do a LEFT JOIN with a table that contains the GCS urls | |
-- NB: this table will contain ~840 duplicates due to a few cases | |
-- where a particular file is stored twice in GCS | |
-- (this happened for various historical/artifactual reasons) | |
j2 AS ( | |
SELECT | |
case_barcode, | |
sample_barcode, | |
aliquot_barcode, | |
sample_type, | |
analyte_type, | |
experimental_strategy, | |
platform, | |
assembly, | |
bam_filename, | |
bam_size, | |
CGHub_file_id, | |
GDC_file_id, | |
bam_gcs_url, | |
bai_gcs_url | |
FROM | |
j1 a | |
LEFT JOIN | |
`isb-cgc.metadata.bams_in_gcs` b | |
ON | |
a.filename=b.bam_filename | |
AND a.file_size=b.bam_size ), | |
-- | |
-- the preceding j2 table is the main result that we want, which | |
-- could be saved for further querying, but for now let's do some | |
-- filtering directly on that result ... | |
-- here we want to count up the # of BAM files for normal samples, | |
-- by case, that are not WGS BAMS and *are* ILLUMINA bams: | |
-- (result is a table with ~18100 rows) | |
f1 AS ( | |
SELECT | |
case_barcode, | |
experimental_strategy, | |
platform, | |
assembly, | |
COUNT(*) AS n | |
FROM | |
j2 | |
WHERE | |
sample_type LIKE "1%" | |
AND experimental_strategy<>"WGS" | |
AND platform="ILLUMINA" | |
GROUP BY | |
1, | |
2, | |
3, | |
4 ) | |
-- | |
-- and finally, from the above, we want to know how many | |
-- distinct cases we have at least one normal BAM for: | |
SELECT | |
DISTINCT case_barcode | |
FROM | |
f1 | |
ORDER BY | |
case_barcode |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment