smrgit/BRCA_CPTAC_RPPA_corr.sql

## BRCA_CPTAC_RPPA_corr.sql
WITH
  -- first we get the 77 samples that passed the QC tests
  qcSet AS (
  SELECT
    TCGA_case_ID AS case_barcode
  FROM
    `isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
  WHERE
    QC_Status="pass" ),
  --
  -- next we extract the sample_barcode, gene, and logRatio from the
  -- CPTAC BRCA proteome iTRAQ table, whlie at the same time doing a
  -- JOIN with the qcSet in order to filter out samples that did not
  -- pass the QC tests
  -- this query returns 720196 avgLogRatio values for 77 samples and
  -- 10599 genes -- on average 9353 genes are detected for each sample
  p AS (
  SELECT
    sample_barcode,
    gene,
    -- the AVG() function is used here for the 3 samples which
    -- were assayed twice; to make sure that we have just one
    -- value per sample per gene
    AVG(unshared_logRatio) AS avgLogRatio
  FROM
    `isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a
  JOIN
    qcSet b
  ON
    SUBSTR(a.sample_barcode,1,12)=b.case_barcode
  GROUP BY
    sample_barcode,
    gene ),
  --
  -- now we get the RPPA protein data, while filtering down to
  -- the samples in the CPTAC set above ...
  -- from this stage, we will wind up with 16551 rows, with data
  -- for 74 samples, and 227 gene/protein pairs (176 unique genes,
    -- and 214 proteins) -- some of the proteins are phospho-proteins
  -- and in some cases multiple genes may map to a single
  -- protein due to non-specific binding in the RPPA assay
  j AS (
  SELECT
    r.sample_barcode AS sample,
    r.gene_name AS gene,
    r.protein_name AS protein,
    r.protein_expression AS rppaExp,
    p.avgLogRatio AS iTRAQ_logRatio
  FROM
    `isb-cgc.TCGA_hg19_data_v0.Protein_Expression` r
  JOIN
    p
  ON
    SUBSTR(r.sample_barcode,1,15)=SUBSTR(p.sample_barcode,1,15)
    AND r.gene_name=p.gene ),
  --
  -- finally we compute a Pearson correlation on the two
  -- expression values, and keep only those correlations
  -- based on observations in at least 20 samples
  -- (1/4 of the available data)
  c AS (
  SELECT
    CORR(iTRAQ_logRatio,
      rppaExp) AS corrByGeneProt,
    COUNT(*) AS n,
    gene,
    protein
  FROM
    j
  GROUP BY
    gene,
    protein
  HAVING
    n>=20 )
SELECT
  *
FROM
  c
ORDER BY
  corrByGeneProt DESC
	WITH
	-- first we get the 77 samples that passed the QC tests
	qcSet AS (
	SELECT
	TCGA_case_ID AS case_barcode
	FROM
	`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
	WHERE
	QC_Status="pass" ),
	--
	-- next we extract the sample_barcode, gene, and logRatio from the
	-- CPTAC BRCA proteome iTRAQ table, whlie at the same time doing a
	-- JOIN with the qcSet in order to filter out samples that did not
	-- pass the QC tests
	-- this query returns 720196 avgLogRatio values for 77 samples and
	-- 10599 genes -- on average 9353 genes are detected for each sample
	p AS (
	SELECT
	sample_barcode,
	gene,
	-- the AVG() function is used here for the 3 samples which
	-- were assayed twice; to make sure that we have just one
	-- value per sample per gene
	AVG(unshared_logRatio) AS avgLogRatio
	FROM
	`isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a
	JOIN
	qcSet b
	ON
	SUBSTR(a.sample_barcode,1,12)=b.case_barcode
	GROUP BY
	sample_barcode,
	gene ),
	--
	-- now we get the RPPA protein data, while filtering down to
	-- the samples in the CPTAC set above ...
	-- from this stage, we will wind up with 16551 rows, with data
	-- for 74 samples, and 227 gene/protein pairs (176 unique genes,
	-- and 214 proteins) -- some of the proteins are phospho-proteins
	-- and in some cases multiple genes may map to a single
	-- protein due to non-specific binding in the RPPA assay
	j AS (
	SELECT
	r.sample_barcode AS sample,
	r.gene_name AS gene,
	r.protein_name AS protein,
	r.protein_expression AS rppaExp,
	p.avgLogRatio AS iTRAQ_logRatio
	FROM
	`isb-cgc.TCGA_hg19_data_v0.Protein_Expression` r
	JOIN
	p
	ON
	SUBSTR(r.sample_barcode,1,15)=SUBSTR(p.sample_barcode,1,15)
	AND r.gene_name=p.gene ),
	--
	-- finally we compute a Pearson correlation on the two
	-- expression values, and keep only those correlations
	-- based on observations in at least 20 samples
	-- (1/4 of the available data)
	c AS (
	SELECT
	CORR(iTRAQ_logRatio,
	rppaExp) AS corrByGeneProt,
	COUNT(*) AS n,
	gene,
	protein
	FROM
	j
	GROUP BY
	gene,
	protein
	HAVING
	n>=20 )
	SELECT
	*
	FROM
	c
	ORDER BY
	corrByGeneProt DESC