Skip to content

Instantly share code, notes, and snippets.

@smrgit
Created September 28, 2017 11:44
Show Gist options
  • Save smrgit/eb150c3485f631df16169ad7efa0e8b9 to your computer and use it in GitHub Desktop.
Save smrgit/eb150c3485f631df16169ad7efa0e8b9 to your computer and use it in GitHub Desktop.
join CGHub and GDC legacy information for TCGA DNA bams
WITH
--
-- first, we grab information about the ~48650 TCGA DNA bam files that were at CGHub
-- (and are aligned to GRCh37/HG19 and were classified as "Live")
CGHubInfo AS (
SELECT
SUBSTR(barcode,1,12) AS case_barcode,
SUBSTR(barcode,1,16) AS sample_barcode,
barcode AS aliquot_barcode,
SUBSTR(barcode,14,2) AS sample_type,
analyte_type,
library_type,
platform,
assembly,
filename,
CAST(file_size AS INT64) AS file_size,
analysis_id AS CGHub_file_id,
study AS CGHub_study
FROM
`isb-cgc.tcga_seq_metadata.CGHub_Manifest_24jun2016`
WHERE
file_type="bam"
AND study LIKE "%TCGA%"
AND barcode LIKE "TCGA-%"
AND (assembly LIKE "%37%"
OR assembly LIKE "%19%")
AND state="Live"
AND library_type NOT LIKE "%RNA%"
AND library_type NOT LIKE "Bisulfite%" ),
--
-- next we grab information about the ~44100 TCGA DNA bam files in the
-- GDC legacy archive
GDCinfo AS (
SELECT
SUBSTR(associated_entities__entity_submitter_id,1,12) AS case_barcode,
SUBSTR(associated_entities__entity_submitter_id,1,16) AS sample_barcode,
associated_entities__entity_submitter_id AS aliquot_barcode,
SUBSTR(associated_entities__entity_submitter_id,14,2) AS sample_type,
file_name,
file_size,
experimental_strategy,
file_id AS GDC_legacy_file_id
FROM
`isb-cgc.GDC_metadata.rel8_fileData_legacy`
WHERE
associated_entities__entity_type="aliquot"
AND cases__project__program__name="TCGA"
AND data_format="BAM"
AND data_type="Aligned reads"
AND data_category="Raw sequencing data"
AND experimental_strategy NOT LIKE "%RNA%"
AND experimental_strategy NOT LIKE "%Bisulfite%" ),
--
-- then we do a FULL JOIN on the two tables above, using
-- the COALESCE function to collapse fields that should be
-- the same
-- this results in a table with ~55800 rows, where:
-- the CGHub_file_id is NULL for ~7155
-- the GDC_file_id is NULL for ~11440
-- neither is NULL for ~37220
j1 AS (
SELECT
COALESCE(a.case_barcode,
b.case_barcode) AS case_barcode,
COALESCE(a.sample_barcode,
b.sample_barcode) AS sample_barcode,
COALESCE(a.aliquot_barcode,
b.aliquot_barcode) AS aliquot_barcode,
COALESCE(a.sample_type,
b.sample_type) AS sample_type,
a.analyte_type,
COALESCE(b.experimental_strategy,
a.library_type) AS experimental_strategy,
a.platform,
a.assembly,
COALESCE(a.filename,
b.file_name) AS filename,
COALESCE(a.file_size,
b.file_size) AS file_size,
a.CGHub_study,
a.CGHub_file_id,
b.GDC_legacy_file_id
FROM
CGHubInfo a
FULL JOIN
GDCinfo b
ON
a.aliquot_barcode=b.aliquot_barcode
AND a.filename=b.file_name
AND a.file_size=b.file_size ),
--
-- next we do a LEFT JOIN with a table that contains the GCS urls
-- NB: this table will contain ~840 duplicates due to a few cases
-- where a particular file is stored twice in GCS
-- (this happened for various historical/artifactual reasons)
j2 AS (
SELECT
case_barcode,
sample_barcode,
aliquot_barcode,
sample_type,
analyte_type,
experimental_strategy,
platform,
assembly,
bam_filename,
bam_size,
CGHub_study,
CGHub_file_id,
GDC_legacy_file_id,
bam_gcs_url,
bai_gcs_url
FROM
j1 a
LEFT JOIN
`isb-cgc.metadata.bams_in_gcs` b
ON
a.filename=b.bam_filename
AND a.file_size=b.bam_size )
SELECT
*
FROM
j2
ORDER BY
aliquot_barcode,
sample_type,
bam_size DESC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment