smrgit/join_CGHub_GDC_legacy_DNA_bams_info.sql

## join_CGHub_GDC_legacy_DNA_bams_info.sql
WITH
  --
  -- first, we grab information about the ~48650 TCGA DNA bam files that were at CGHub
  -- (and are aligned to GRCh37/HG19 and were classified as "Live")
  CGHubInfo AS (
  SELECT
    SUBSTR(barcode,1,12) AS case_barcode,
    SUBSTR(barcode,1,16) AS sample_barcode,
    barcode AS aliquot_barcode,
    SUBSTR(barcode,14,2) AS sample_type,
    analyte_type,
    library_type,
    platform,
    assembly,
    filename,
    CAST(file_size AS INT64) AS file_size,
    analysis_id AS CGHub_file_id,
    study AS CGHub_study
  FROM
    `isb-cgc.tcga_seq_metadata.CGHub_Manifest_24jun2016`
  WHERE
    file_type="bam"
    AND study LIKE "%TCGA%"
    AND barcode LIKE "TCGA-%"
    AND (assembly LIKE "%37%"
      OR assembly LIKE "%19%")
    AND state="Live"
    AND library_type NOT LIKE "%RNA%"
    AND library_type NOT LIKE "Bisulfite%" ),
  --
  -- next we grab information about the ~44100 TCGA DNA bam files in the
  -- GDC legacy archive
  GDCinfo AS (
  SELECT
    SUBSTR(associated_entities__entity_submitter_id,1,12) AS case_barcode,
    SUBSTR(associated_entities__entity_submitter_id,1,16) AS sample_barcode,
    associated_entities__entity_submitter_id AS aliquot_barcode,
    SUBSTR(associated_entities__entity_submitter_id,14,2) AS sample_type,
    file_name,
    file_size,
    experimental_strategy,
    file_id AS GDC_legacy_file_id
  FROM
    `isb-cgc.GDC_metadata.rel8_fileData_legacy`
  WHERE
    associated_entities__entity_type="aliquot"
    AND cases__project__program__name="TCGA"
    AND data_format="BAM"
    AND data_type="Aligned reads"
    AND data_category="Raw sequencing data"
    AND experimental_strategy NOT LIKE "%RNA%"
    AND experimental_strategy NOT LIKE "%Bisulfite%" ),
  --
  -- then we do a FULL JOIN on the two tables above, using
  -- the COALESCE function to collapse fields that should be
  -- the same
  -- this results in a table with ~55800 rows, where:
  -- the CGHub_file_id is NULL for ~7155
  -- the GDC_file_id is NULL for ~11440
  -- neither is NULL for ~37220
  j1 AS (
  SELECT
    COALESCE(a.case_barcode,
      b.case_barcode) AS case_barcode,
    COALESCE(a.sample_barcode,
      b.sample_barcode) AS sample_barcode,
    COALESCE(a.aliquot_barcode,
      b.aliquot_barcode) AS aliquot_barcode,
    COALESCE(a.sample_type,
      b.sample_type) AS sample_type,
    a.analyte_type,
    COALESCE(b.experimental_strategy,
      a.library_type) AS experimental_strategy,
    a.platform,
    a.assembly,
    COALESCE(a.filename,
      b.file_name) AS filename,
    COALESCE(a.file_size,
      b.file_size) AS file_size,
    a.CGHub_study,
    a.CGHub_file_id,
    b.GDC_legacy_file_id
  FROM
    CGHubInfo a
  FULL JOIN
    GDCinfo b
  ON
    a.aliquot_barcode=b.aliquot_barcode
    AND a.filename=b.file_name
    AND a.file_size=b.file_size ),
  --
  -- next we do a LEFT JOIN with a table that contains the GCS urls
  -- NB: this table will contain ~840 duplicates due to a few cases
  -- where a particular file is stored twice in GCS
  -- (this happened for various historical/artifactual reasons)
  j2 AS (
  SELECT
    case_barcode,
    sample_barcode,
    aliquot_barcode,
    sample_type,
    analyte_type,
    experimental_strategy,
    platform,
    assembly,
    bam_filename,
    bam_size,
    CGHub_study,
    CGHub_file_id,
    GDC_legacy_file_id,
    bam_gcs_url,
    bai_gcs_url
  FROM
    j1 a
  LEFT JOIN
    `isb-cgc.metadata.bams_in_gcs` b
  ON
    a.filename=b.bam_filename
    AND a.file_size=b.bam_size )
SELECT
  *
FROM
  j2
ORDER BY
  aliquot_barcode,
  sample_type,
  bam_size DESC
	WITH
	--
	-- first, we grab information about the ~48650 TCGA DNA bam files that were at CGHub
	-- (and are aligned to GRCh37/HG19 and were classified as "Live")
	CGHubInfo AS (
	SELECT
	SUBSTR(barcode,1,12) AS case_barcode,
	SUBSTR(barcode,1,16) AS sample_barcode,
	barcode AS aliquot_barcode,
	SUBSTR(barcode,14,2) AS sample_type,
	analyte_type,
	library_type,
	platform,
	assembly,
	filename,
	CAST(file_size AS INT64) AS file_size,
	analysis_id AS CGHub_file_id,
	study AS CGHub_study
	FROM
	`isb-cgc.tcga_seq_metadata.CGHub_Manifest_24jun2016`
	WHERE
	file_type="bam"
	AND study LIKE "%TCGA%"
	AND barcode LIKE "TCGA-%"
	AND (assembly LIKE "%37%"
	OR assembly LIKE "%19%")
	AND state="Live"
	AND library_type NOT LIKE "%RNA%"
	AND library_type NOT LIKE "Bisulfite%" ),
	--
	-- next we grab information about the ~44100 TCGA DNA bam files in the
	-- GDC legacy archive
	GDCinfo AS (
	SELECT
	SUBSTR(associated_entities__entity_submitter_id,1,12) AS case_barcode,
	SUBSTR(associated_entities__entity_submitter_id,1,16) AS sample_barcode,
	associated_entities__entity_submitter_id AS aliquot_barcode,
	SUBSTR(associated_entities__entity_submitter_id,14,2) AS sample_type,
	file_name,
	file_size,
	experimental_strategy,
	file_id AS GDC_legacy_file_id
	FROM
	`isb-cgc.GDC_metadata.rel8_fileData_legacy`
	WHERE
	associated_entities__entity_type="aliquot"
	AND cases__project__program__name="TCGA"
	AND data_format="BAM"
	AND data_type="Aligned reads"
	AND data_category="Raw sequencing data"
	AND experimental_strategy NOT LIKE "%RNA%"
	AND experimental_strategy NOT LIKE "%Bisulfite%" ),
	--
	-- then we do a FULL JOIN on the two tables above, using
	-- the COALESCE function to collapse fields that should be
	-- the same
	-- this results in a table with ~55800 rows, where:
	-- the CGHub_file_id is NULL for ~7155
	-- the GDC_file_id is NULL for ~11440
	-- neither is NULL for ~37220
	j1 AS (
	SELECT
	COALESCE(a.case_barcode,
	b.case_barcode) AS case_barcode,
	COALESCE(a.sample_barcode,
	b.sample_barcode) AS sample_barcode,
	COALESCE(a.aliquot_barcode,
	b.aliquot_barcode) AS aliquot_barcode,
	COALESCE(a.sample_type,
	b.sample_type) AS sample_type,
	a.analyte_type,
	COALESCE(b.experimental_strategy,
	a.library_type) AS experimental_strategy,
	a.platform,
	a.assembly,
	COALESCE(a.filename,
	b.file_name) AS filename,
	COALESCE(a.file_size,
	b.file_size) AS file_size,
	a.CGHub_study,
	a.CGHub_file_id,
	b.GDC_legacy_file_id
	FROM
	CGHubInfo a
	FULL JOIN
	GDCinfo b
	ON
	a.aliquot_barcode=b.aliquot_barcode
	AND a.filename=b.file_name
	AND a.file_size=b.file_size ),
	--
	-- next we do a LEFT JOIN with a table that contains the GCS urls
	-- NB: this table will contain ~840 duplicates due to a few cases
	-- where a particular file is stored twice in GCS
	-- (this happened for various historical/artifactual reasons)
	j2 AS (
	SELECT
	case_barcode,
	sample_barcode,
	aliquot_barcode,
	sample_type,
	analyte_type,
	experimental_strategy,
	platform,
	assembly,
	bam_filename,
	bam_size,
	CGHub_study,
	CGHub_file_id,
	GDC_legacy_file_id,
	bam_gcs_url,
	bai_gcs_url
	FROM
	j1 a
	LEFT JOIN
	`isb-cgc.metadata.bams_in_gcs` b
	ON
	a.filename=b.bam_filename
	AND a.file_size=b.bam_size )
	SELECT
	*
	FROM
	j2
	ORDER BY
	aliquot_barcode,
	sample_type,
	bam_size DESC