smrgit/find_AML_BAM_files.sql

## find_AML_BAM_files.sql
WITH
  --
  -- this initial table collects the case barcodes for all TCGA LAML and
  -- TARGET AML cases from the two "Clinical" tables
  -- Result: this sub-query returns a table with 1193 rows
  t1 AS (
  SELECT
    program_name,
    case_barcode
  FROM
    `isb-cgc.TCGA_bioclin_v0.Clinical`
  WHERE
    project_short_name="TCGA-LAML"
  UNION ALL
  SELECT
    program_name,
    case_barcode
  FROM
    `isb-cgc.TARGET_bioclin_v0.Clinical`
  WHERE
    project_short_name="TARGET-AML" ),
  --
  -- next, we use the GDC "cases" metadata table in order to get the GDC
  -- case identifiers assigned to the case barcodes (aka 'submitter ids')
  -- NB: this could also be done with a JOIN, but instead we use the
  -- "WHERE ... IN ... " operator just because it can be a handy tool in
  -- many situations
  -- Result: this sub-query returns a table with 1135 rows, so 58 cases
  -- from the "Clinical" table(s) were not found in the GDC metadata
  t2 AS (
  SELECT
    case_id,
    submitter_id
  FROM
    `isb-cgc.GDC_metadata.rel8_caseData`
  WHERE
    submitter_id IN (
    SELECT
      case_barcode
    FROM
      t1) ),
  --
  -- next, we'll join the previous two tables
  -- Result: this operation returns a table with 1135 rows, each containing
  -- the program_name (eg "TCGA"), the case_barcode (eg "TCGA-AB-2869")
  -- and the case_id (eg "e44e2ac3-4a83-40b0-b266-a36626ffd6ae")
  j1 AS (
  SELECT
    t1.program_name,
    t1.case_barcode,
    t2.case_id
  FROM
    t1
  JOIN
    t2
  ON
    t1.case_barcode=t2.submitter_id),
  --
  -- next, we want to use the case_ids from above to find information
  -- about available BAM files  --  from the 'current' GDC archive, not
  -- the legacy archive
  -- Result: this step returns a table with 1150 rows -- some cases have
  -- multiple associated BAM files, some have none
  t3 AS (
  SELECT
    file_id,
    cases__project__project_id AS project_id,
    associated_entities__case_id,
    acl,
    experimental_strategy,
    analysis__workflow_type,
    data_category,
    data_format,
    data_type
  FROM
    `isb-cgc.GDC_metadata.rel8_fileData_current`
  WHERE
    data_format="BAM"
    AND associated_entities__case_id IN (
    SELECT
      case_id
    FROM
      j1 ) ),
  --
  -- and finally we will JOIN this list of BAM files with a table
  -- containing a mapping from the GDC file_id to the ISB-CGC
  -- bucket path in GCS (Google Cloud Storage)
  -- Result: the table produced by this step also has 1150 rows,
  -- indicating that every file previously identified is in fact
  -- available in an ISB-CGC bucket
  j2 AS (
  SELECT
    *
  FROM
    t3 a
  JOIN
    `isb-cgc.GDC_metadata.rel8_GDCfileID_to_GCSurl` b
  ON
    a.file_id=b.file_gdc_id )
  --
  -- Perhaps at this point, the user might want to know more about these BAM files,
  -- and might, for example (as shown below), count up the BAM files based on
  -- information such as the ACL, project id, experimental strategy, and
  -- analysis workflow type
SELECT
  acl,
  project_id,
  experimental_strategy,
  analysis__workflow_type,
  COUNT(*) AS n
FROM
  j2
GROUP BY
  acl,
  project_id,
  experimental_strategy,
  analysis__workflow_type
ORDER BY
  experimental_strategy,
  analysis__workflow_type,
  n DESC
	WITH
	--
	-- this initial table collects the case barcodes for all TCGA LAML and
	-- TARGET AML cases from the two "Clinical" tables
	-- Result: this sub-query returns a table with 1193 rows
	t1 AS (
	SELECT
	program_name,
	case_barcode
	FROM
	`isb-cgc.TCGA_bioclin_v0.Clinical`
	WHERE
	project_short_name="TCGA-LAML"
	UNION ALL
	SELECT
	program_name,
	case_barcode
	FROM
	`isb-cgc.TARGET_bioclin_v0.Clinical`
	WHERE
	project_short_name="TARGET-AML" ),
	--
	-- next, we use the GDC "cases" metadata table in order to get the GDC
	-- case identifiers assigned to the case barcodes (aka 'submitter ids')
	-- NB: this could also be done with a JOIN, but instead we use the
	-- "WHERE ... IN ... " operator just because it can be a handy tool in
	-- many situations
	-- Result: this sub-query returns a table with 1135 rows, so 58 cases
	-- from the "Clinical" table(s) were not found in the GDC metadata
	t2 AS (
	SELECT
	case_id,
	submitter_id
	FROM
	`isb-cgc.GDC_metadata.rel8_caseData`
	WHERE
	submitter_id IN (
	SELECT
	case_barcode
	FROM
	t1) ),
	--
	-- next, we'll join the previous two tables
	-- Result: this operation returns a table with 1135 rows, each containing
	-- the program_name (eg "TCGA"), the case_barcode (eg "TCGA-AB-2869")
	-- and the case_id (eg "e44e2ac3-4a83-40b0-b266-a36626ffd6ae")
	j1 AS (
	SELECT
	t1.program_name,
	t1.case_barcode,
	t2.case_id
	FROM
	t1
	JOIN
	t2
	ON
	t1.case_barcode=t2.submitter_id),
	--
	-- next, we want to use the case_ids from above to find information
	-- about available BAM files -- from the 'current' GDC archive, not
	-- the legacy archive
	-- Result: this step returns a table with 1150 rows -- some cases have
	-- multiple associated BAM files, some have none
	t3 AS (
	SELECT
	file_id,
	cases__project__project_id AS project_id,
	associated_entities__case_id,
	acl,
	experimental_strategy,
	analysis__workflow_type,
	data_category,
	data_format,
	data_type
	FROM
	`isb-cgc.GDC_metadata.rel8_fileData_current`
	WHERE
	data_format="BAM"
	AND associated_entities__case_id IN (
	SELECT
	case_id
	FROM
	j1 ) ),
	--
	-- and finally we will JOIN this list of BAM files with a table
	-- containing a mapping from the GDC file_id to the ISB-CGC
	-- bucket path in GCS (Google Cloud Storage)
	-- Result: the table produced by this step also has 1150 rows,
	-- indicating that every file previously identified is in fact
	-- available in an ISB-CGC bucket
	j2 AS (
	SELECT
	*
	FROM
	t3 a
	JOIN
	`isb-cgc.GDC_metadata.rel8_GDCfileID_to_GCSurl` b
	ON
	a.file_id=b.file_gdc_id )
	--
	-- Perhaps at this point, the user might want to know more about these BAM files,
	-- and might, for example (as shown below), count up the BAM files based on
	-- information such as the ACL, project id, experimental strategy, and
	-- analysis workflow type
	SELECT
	acl,
	project_id,
	experimental_strategy,
	analysis__workflow_type,
	COUNT(*) AS n
	FROM
	j2
	GROUP BY
	acl,
	project_id,
	experimental_strategy,
	analysis__workflow_type
	ORDER BY
	experimental_strategy,
	analysis__workflow_type,
	n DESC