smrgit/find_normal_bams_in_gcs.sql

## find_normal_bams_in_gcs.sql
WITH
  --
  -- first, we grab information about the ~48650 TCGA DNA bam files that were at CGHub
  -- (and are aligned to GRCh37/HG19 and were classified as "Live")
  dbGaPinfo AS (
  SELECT
    SUBSTR(barcode,1,12) AS case_barcode,
    SUBSTR(barcode,1,16) AS sample_barcode,
    barcode AS aliquot_barcode,
    SUBSTR(barcode,14,2) AS sample_type,
    analyte_type,
    library_type,
    platform,
    assembly,
    filename,
    CAST(file_size AS INT64) AS file_size,
    analysis_id AS CGHub_file_id
  FROM
    `isb-cgc.tcga_seq_metadata.CGHub_Manifest_24jun2016`
  WHERE
    file_type="bam"
    AND study LIKE "%TCGA%"
    AND barcode LIKE "TCGA-%"
    AND (assembly LIKE "%37%"
      OR assembly LIKE "%19%")
    AND state="Live"
    AND library_type NOT LIKE "%RNA%"
    AND library_type NOT LIKE "Bisulfite%" ),
  --
  -- next we grab information about the ~44100 TCGA DNA bam files in the
  -- GDC legacy archive
  GDCinfo AS (
  SELECT
    SUBSTR(associated_entities__entity_submitter_id,1,12) AS case_barcode,
    SUBSTR(associated_entities__entity_submitter_id,1,16) AS sample_barcode,
    associated_entities__entity_submitter_id AS aliquot_barcode,
    SUBSTR(associated_entities__entity_submitter_id,14,2) AS sample_type,
    file_name,
    file_size,
    experimental_strategy,
    file_id AS GDC_file_id
  FROM
    `isb-cgc.GDC_metadata.rel8_fileData_legacy`
  WHERE
    associated_entities__entity_type="aliquot"
    AND cases__project__program__name="TCGA"
    AND data_format="BAM"
    AND data_type="Aligned reads"
    AND data_category="Raw sequencing data"
    AND experimental_strategy NOT LIKE "%RNA%"
    AND experimental_strategy NOT LIKE "%Bisulfite%" ),
  --
  -- then we do a FULL JOIN on the two tables above, using
  -- the COALESCE function to collapse fields that should be
  -- the same
  -- this results in a table with ~55800 rows, where:
  -- the CGHub_file_id is NULL for ~7155
  -- the GDC_file_id is NULL for ~11440
  -- neither is NULL for ~37220
  j1 AS (
  SELECT
    COALESCE(a.case_barcode,
      b.case_barcode) AS case_barcode,
    COALESCE(a.sample_barcode,
      b.sample_barcode) AS sample_barcode,
    COALESCE(a.aliquot_barcode,
      b.aliquot_barcode) AS aliquot_barcode,
    COALESCE(a.sample_type,
      b.sample_type) AS sample_type,
    a.analyte_type,
    COALESCE(b.experimental_strategy,
      a.library_type) AS experimental_strategy,
    a.platform,
    a.assembly,
    COALESCE(a.filename,
      b.file_name) AS filename,
    COALESCE(a.file_size,
      b.file_size) AS file_size,
    a.CGHub_file_id,
    b.GDC_file_id
  FROM
    dbGaPinfo a
  FULL JOIN
    GDCinfo b
  ON
    a.aliquot_barcode=b.aliquot_barcode
    AND a.filename=b.file_name
    AND a.file_size=b.file_size ),
  --
  -- next we do a LEFT JOIN with a table that contains the GCS urls
  -- NB: this table will contain ~840 duplicates due to a few cases
  -- where a particular file is stored twice in GCS
  -- (this happened for various historical/artifactual reasons)
  j2 AS (
  SELECT
    case_barcode,
    sample_barcode,
    aliquot_barcode,
    sample_type,
    analyte_type,
    experimental_strategy,
    platform,
    assembly,
    bam_filename,
    bam_size,
    CGHub_file_id,
    GDC_file_id,
    bam_gcs_url,
    bai_gcs_url
  FROM
    j1 a
  LEFT JOIN
    `isb-cgc.metadata.bams_in_gcs` b
  ON
    a.filename=b.bam_filename
    AND a.file_size=b.bam_size ),
  --
  -- the preceding j2 table is the main result that we want, which
  -- could be saved for further querying, but for now let's do some
  -- filtering directly on that result ...
  -- here we want to count up the # of BAM files for normal samples,
  -- by case, that are not WGS BAMS and *are* ILLUMINA bams:
  -- (result is a table with ~18100 rows)
  f1 AS (
  SELECT
    case_barcode,
    experimental_strategy,
    platform,
    assembly,
    COUNT(*) AS n
  FROM
    j2
  WHERE
    sample_type LIKE "1%"
    AND experimental_strategy<>"WGS"
    AND platform="ILLUMINA"
  GROUP BY
    1,
    2,
    3,
    4 )
  --
  -- and finally, from the above, we want to know how many
  -- distinct cases we have at least one normal BAM for:
SELECT
  DISTINCT case_barcode
FROM
  f1
ORDER BY
  case_barcode
	WITH
	--
	-- first, we grab information about the ~48650 TCGA DNA bam files that were at CGHub
	-- (and are aligned to GRCh37/HG19 and were classified as "Live")
	dbGaPinfo AS (
	SELECT
	SUBSTR(barcode,1,12) AS case_barcode,
	SUBSTR(barcode,1,16) AS sample_barcode,
	barcode AS aliquot_barcode,
	SUBSTR(barcode,14,2) AS sample_type,
	analyte_type,
	library_type,
	platform,
	assembly,
	filename,
	CAST(file_size AS INT64) AS file_size,
	analysis_id AS CGHub_file_id
	FROM
	`isb-cgc.tcga_seq_metadata.CGHub_Manifest_24jun2016`
	WHERE
	file_type="bam"
	AND study LIKE "%TCGA%"
	AND barcode LIKE "TCGA-%"
	AND (assembly LIKE "%37%"
	OR assembly LIKE "%19%")
	AND state="Live"
	AND library_type NOT LIKE "%RNA%"
	AND library_type NOT LIKE "Bisulfite%" ),
	--
	-- next we grab information about the ~44100 TCGA DNA bam files in the
	-- GDC legacy archive
	GDCinfo AS (
	SELECT
	SUBSTR(associated_entities__entity_submitter_id,1,12) AS case_barcode,
	SUBSTR(associated_entities__entity_submitter_id,1,16) AS sample_barcode,
	associated_entities__entity_submitter_id AS aliquot_barcode,
	SUBSTR(associated_entities__entity_submitter_id,14,2) AS sample_type,
	file_name,
	file_size,
	experimental_strategy,
	file_id AS GDC_file_id
	FROM
	`isb-cgc.GDC_metadata.rel8_fileData_legacy`
	WHERE
	associated_entities__entity_type="aliquot"
	AND cases__project__program__name="TCGA"
	AND data_format="BAM"
	AND data_type="Aligned reads"
	AND data_category="Raw sequencing data"
	AND experimental_strategy NOT LIKE "%RNA%"
	AND experimental_strategy NOT LIKE "%Bisulfite%" ),
	--
	-- then we do a FULL JOIN on the two tables above, using
	-- the COALESCE function to collapse fields that should be
	-- the same
	-- this results in a table with ~55800 rows, where:
	-- the CGHub_file_id is NULL for ~7155
	-- the GDC_file_id is NULL for ~11440
	-- neither is NULL for ~37220
	j1 AS (
	SELECT
	COALESCE(a.case_barcode,
	b.case_barcode) AS case_barcode,
	COALESCE(a.sample_barcode,
	b.sample_barcode) AS sample_barcode,
	COALESCE(a.aliquot_barcode,
	b.aliquot_barcode) AS aliquot_barcode,
	COALESCE(a.sample_type,
	b.sample_type) AS sample_type,
	a.analyte_type,
	COALESCE(b.experimental_strategy,
	a.library_type) AS experimental_strategy,
	a.platform,
	a.assembly,
	COALESCE(a.filename,
	b.file_name) AS filename,
	COALESCE(a.file_size,
	b.file_size) AS file_size,
	a.CGHub_file_id,
	b.GDC_file_id
	FROM
	dbGaPinfo a
	FULL JOIN
	GDCinfo b
	ON
	a.aliquot_barcode=b.aliquot_barcode
	AND a.filename=b.file_name
	AND a.file_size=b.file_size ),
	--
	-- next we do a LEFT JOIN with a table that contains the GCS urls
	-- NB: this table will contain ~840 duplicates due to a few cases
	-- where a particular file is stored twice in GCS
	-- (this happened for various historical/artifactual reasons)
	j2 AS (
	SELECT
	case_barcode,
	sample_barcode,
	aliquot_barcode,
	sample_type,
	analyte_type,
	experimental_strategy,
	platform,
	assembly,
	bam_filename,
	bam_size,
	CGHub_file_id,
	GDC_file_id,
	bam_gcs_url,
	bai_gcs_url
	FROM
	j1 a
	LEFT JOIN
	`isb-cgc.metadata.bams_in_gcs` b
	ON
	a.filename=b.bam_filename
	AND a.file_size=b.bam_size ),
	--
	-- the preceding j2 table is the main result that we want, which
	-- could be saved for further querying, but for now let's do some
	-- filtering directly on that result ...
	-- here we want to count up the # of BAM files for normal samples,
	-- by case, that are not WGS BAMS and are ILLUMINA bams:
	-- (result is a table with ~18100 rows)
	f1 AS (
	SELECT
	case_barcode,
	experimental_strategy,
	platform,
	assembly,
	COUNT(*) AS n
	FROM
	j2
	WHERE
	sample_type LIKE "1%"
	AND experimental_strategy<>"WGS"
	AND platform="ILLUMINA"
	GROUP BY
	1,
	2,
	3,
	4 )
	--
	-- and finally, from the above, we want to know how many
	-- distinct cases we have at least one normal BAM for:
	SELECT
	DISTINCT case_barcode
	FROM
	f1
	ORDER BY
	case_barcode