Created
September 28, 2017 11:44
-
-
Save smrgit/eb150c3485f631df16169ad7efa0e8b9 to your computer and use it in GitHub Desktop.
join CGHub and GDC legacy information for TCGA DNA bams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH | |
-- | |
-- first, we grab information about the ~48650 TCGA DNA bam files that were at CGHub | |
-- (and are aligned to GRCh37/HG19 and were classified as "Live") | |
CGHubInfo AS ( | |
SELECT | |
SUBSTR(barcode,1,12) AS case_barcode, | |
SUBSTR(barcode,1,16) AS sample_barcode, | |
barcode AS aliquot_barcode, | |
SUBSTR(barcode,14,2) AS sample_type, | |
analyte_type, | |
library_type, | |
platform, | |
assembly, | |
filename, | |
CAST(file_size AS INT64) AS file_size, | |
analysis_id AS CGHub_file_id, | |
study AS CGHub_study | |
FROM | |
`isb-cgc.tcga_seq_metadata.CGHub_Manifest_24jun2016` | |
WHERE | |
file_type="bam" | |
AND study LIKE "%TCGA%" | |
AND barcode LIKE "TCGA-%" | |
AND (assembly LIKE "%37%" | |
OR assembly LIKE "%19%") | |
AND state="Live" | |
AND library_type NOT LIKE "%RNA%" | |
AND library_type NOT LIKE "Bisulfite%" ), | |
-- | |
-- next we grab information about the ~44100 TCGA DNA bam files in the | |
-- GDC legacy archive | |
GDCinfo AS ( | |
SELECT | |
SUBSTR(associated_entities__entity_submitter_id,1,12) AS case_barcode, | |
SUBSTR(associated_entities__entity_submitter_id,1,16) AS sample_barcode, | |
associated_entities__entity_submitter_id AS aliquot_barcode, | |
SUBSTR(associated_entities__entity_submitter_id,14,2) AS sample_type, | |
file_name, | |
file_size, | |
experimental_strategy, | |
file_id AS GDC_legacy_file_id | |
FROM | |
`isb-cgc.GDC_metadata.rel8_fileData_legacy` | |
WHERE | |
associated_entities__entity_type="aliquot" | |
AND cases__project__program__name="TCGA" | |
AND data_format="BAM" | |
AND data_type="Aligned reads" | |
AND data_category="Raw sequencing data" | |
AND experimental_strategy NOT LIKE "%RNA%" | |
AND experimental_strategy NOT LIKE "%Bisulfite%" ), | |
-- | |
-- then we do a FULL JOIN on the two tables above, using | |
-- the COALESCE function to collapse fields that should be | |
-- the same | |
-- this results in a table with ~55800 rows, where: | |
-- the CGHub_file_id is NULL for ~7155 | |
-- the GDC_file_id is NULL for ~11440 | |
-- neither is NULL for ~37220 | |
j1 AS ( | |
SELECT | |
COALESCE(a.case_barcode, | |
b.case_barcode) AS case_barcode, | |
COALESCE(a.sample_barcode, | |
b.sample_barcode) AS sample_barcode, | |
COALESCE(a.aliquot_barcode, | |
b.aliquot_barcode) AS aliquot_barcode, | |
COALESCE(a.sample_type, | |
b.sample_type) AS sample_type, | |
a.analyte_type, | |
COALESCE(b.experimental_strategy, | |
a.library_type) AS experimental_strategy, | |
a.platform, | |
a.assembly, | |
COALESCE(a.filename, | |
b.file_name) AS filename, | |
COALESCE(a.file_size, | |
b.file_size) AS file_size, | |
a.CGHub_study, | |
a.CGHub_file_id, | |
b.GDC_legacy_file_id | |
FROM | |
CGHubInfo a | |
FULL JOIN | |
GDCinfo b | |
ON | |
a.aliquot_barcode=b.aliquot_barcode | |
AND a.filename=b.file_name | |
AND a.file_size=b.file_size ), | |
-- | |
-- next we do a LEFT JOIN with a table that contains the GCS urls | |
-- NB: this table will contain ~840 duplicates due to a few cases | |
-- where a particular file is stored twice in GCS | |
-- (this happened for various historical/artifactual reasons) | |
j2 AS ( | |
SELECT | |
case_barcode, | |
sample_barcode, | |
aliquot_barcode, | |
sample_type, | |
analyte_type, | |
experimental_strategy, | |
platform, | |
assembly, | |
bam_filename, | |
bam_size, | |
CGHub_study, | |
CGHub_file_id, | |
GDC_legacy_file_id, | |
bam_gcs_url, | |
bai_gcs_url | |
FROM | |
j1 a | |
LEFT JOIN | |
`isb-cgc.metadata.bams_in_gcs` b | |
ON | |
a.filename=b.bam_filename | |
AND a.file_size=b.bam_size ) | |
SELECT | |
* | |
FROM | |
j2 | |
ORDER BY | |
aliquot_barcode, | |
sample_type, | |
bam_size DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment