smrgit/corr_TCGA_GTEx.sql

## corr_TCGA_GTEx.sql
WITH
  --
  -- get the top-5000 most variable genes based on computing the standard
  -- deviation on the GTEx "gene_median_tpm" table
  GTEx_top5K AS (
  SELECT
    gene_id,
    gene_description,
    STDDEV(gene_exp) AS sigmaExp
  FROM
    `isb-cgc.GTEx_v7.gene_median_tpm`
  GROUP BY
    1,
    2
  ORDER BY
    sigmaExp DESC
  LIMIT
    5000 ),
  --
  -- similarly, get the top-5000 most variable genes from the TCGA data
  -- (note that we are using only the Illumina HiSeq expression data)
  TCGA_top5K AS (
  SELECT
    HGNC_gene_symbol,
    STDDEV(normalized_count) AS sigmaExp
  FROM
    `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM`
  WHERE
    platform="IlluminaHiSeq"
    AND HGNC_gene_symbol IS NOT NULL
  GROUP BY
    1
  ORDER BY
    sigmaExp DESC
  LIMIT
    5000 ),
  --
  -- now let's join these two lists of 5000 genes so that we can
  -- use the intersection as the basis of our downstream analysis;
  -- the result is a list of about 3200 highly-variable genes
  geneList AS (
  SELECT
    gene_id,
    HGNC_gene_symbol AS gene_symbol
  FROM
    GTEx_top5K
  JOIN
    TCGA_top5K
  ON
    gene_description=HGNC_gene_symbol ),
  --
  -- now we get the actual TCGA expression data, for *all* samples, but only
  -- for genes in our highly-variable list (and only from the Illumina HiSeq platform);
  -- we also generate a "dense ranking" of the genes "partitioned by" sample;
  -- the result of this query is going to be a rather large table: ~10,340 samples
  -- and ~3200 genes --> over 33 million rows!
  tcgaData AS (
  SELECT
    sample_barcode,
    project_short_name AS project,
    HGNC_gene_symbol AS gene_symbol,
    normalized_count AS expr,
    DENSE_RANK() OVER (PARTITION BY sample_barcode ORDER BY normalized_count ASC) AS rankExpr
  FROM
    `isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM`
  WHERE
    platform="IlluminaHiSeq"
    AND HGNC_gene_symbol IN (
    SELECT
      gene_symbol
    FROM
      geneList) ),
  --
  -- and now we repeat the exact same thing with the GTEx data, although the
  -- result will be a much smaller table since there are only 53 tissue types
  gtexData AS (
  SELECT
    SMTSD AS tissueType,
    gene_description AS gene_symbol,
    gene_exp AS expr,
    DENSE_RANK() OVER (PARTITION BY SMTSD ORDER BY gene_exp ASC) AS rankExpr
  FROM
    `isb-cgc.GTEx_v7.gene_median_tpm`
  WHERE
    gene_description IN (
    SELECT
      gene_symbol
    FROM
      geneList ) ),
  --
  -- now we do a "JOIN" operation between the TCGA expression table and the
  -- GTEx expression table, "ON" the gene symbol; the number of output rows
  -- will be equal to: # of TCGA samples x # of GTEx tissue types x # of genes
  --                               10340 x 53 x 3200 = 1.75 billion rows
  --
  -- although the BigQuery / SQL query planning an optimization will be able to
  -- avoid ever constructing this particular table by splitting this task up and
  -- passing the results directly on to the next step which GROUPs by sample and
  -- tissue type, thereby producing a much smaller table
  j1 AS (
  SELECT
    g.tissueType AS GTEx_tissueType,
    g.gene_symbol,
    g.rankExpr AS gRank,
    t.sample_barcode,
    t.project AS TCGA_project,
    t.rankExpr AS tRank
  FROM
    gtexData g
  JOIN
    tcgaData t
  ON
    g.gene_symbol=t.gene_symbol ),
  --
  -- now that we have the results of the above JOIN, we can perform a CORR() operation
  -- using the ranks, GROUPing by sample (and project) and tissue type
  --
  -- the results of this final step is ~545K correlations, ie ~10300 x 53
  gtCorr AS (
  SELECT
    GTEx_tissueType,
    sample_barcode,
    TCGA_project,
    CORR(gRank,
      tRank) AS corr
  FROM
    j1
  GROUP BY
    1,
    2,
    3 )
  --
  -- this final step could have been included in the previous step, but in case
  -- we wanted to do some other final post-processing, we're setting it up like this
SELECT
  *
FROM
  gtCorr
ORDER BY
  corr DESC
	WITH
	--
	-- get the top-5000 most variable genes based on computing the standard
	-- deviation on the GTEx "gene_median_tpm" table
	GTEx_top5K AS (
	SELECT
	gene_id,
	gene_description,
	STDDEV(gene_exp) AS sigmaExp
	FROM
	`isb-cgc.GTEx_v7.gene_median_tpm`
	GROUP BY
	1,
	2
	ORDER BY
	sigmaExp DESC
	LIMIT
	5000 ),
	--
	-- similarly, get the top-5000 most variable genes from the TCGA data
	-- (note that we are using only the Illumina HiSeq expression data)
	TCGA_top5K AS (
	SELECT
	HGNC_gene_symbol,
	STDDEV(normalized_count) AS sigmaExp
	FROM
	`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM`
	WHERE
	platform="IlluminaHiSeq"
	AND HGNC_gene_symbol IS NOT NULL
	GROUP BY
	1
	ORDER BY
	sigmaExp DESC
	LIMIT
	5000 ),
	--
	-- now let's join these two lists of 5000 genes so that we can
	-- use the intersection as the basis of our downstream analysis;
	-- the result is a list of about 3200 highly-variable genes
	geneList AS (
	SELECT
	gene_id,
	HGNC_gene_symbol AS gene_symbol
	FROM
	GTEx_top5K
	JOIN
	TCGA_top5K
	ON
	gene_description=HGNC_gene_symbol ),
	--
	-- now we get the actual TCGA expression data, for all samples, but only
	-- for genes in our highly-variable list (and only from the Illumina HiSeq platform);
	-- we also generate a "dense ranking" of the genes "partitioned by" sample;
	-- the result of this query is going to be a rather large table: ~10,340 samples
	-- and ~3200 genes --> over 33 million rows!
	tcgaData AS (
	SELECT
	sample_barcode,
	project_short_name AS project,
	HGNC_gene_symbol AS gene_symbol,
	normalized_count AS expr,
	DENSE_RANK() OVER (PARTITION BY sample_barcode ORDER BY normalized_count ASC) AS rankExpr
	FROM
	`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM`
	WHERE
	platform="IlluminaHiSeq"
	AND HGNC_gene_symbol IN (
	SELECT
	gene_symbol
	FROM
	geneList) ),
	--
	-- and now we repeat the exact same thing with the GTEx data, although the
	-- result will be a much smaller table since there are only 53 tissue types
	gtexData AS (
	SELECT
	SMTSD AS tissueType,
	gene_description AS gene_symbol,
	gene_exp AS expr,
	DENSE_RANK() OVER (PARTITION BY SMTSD ORDER BY gene_exp ASC) AS rankExpr
	FROM
	`isb-cgc.GTEx_v7.gene_median_tpm`
	WHERE
	gene_description IN (
	SELECT
	gene_symbol
	FROM
	geneList ) ),
	--
	-- now we do a "JOIN" operation between the TCGA expression table and the
	-- GTEx expression table, "ON" the gene symbol; the number of output rows
	-- will be equal to: # of TCGA samples x # of GTEx tissue types x # of genes
	-- 10340 x 53 x 3200 = 1.75 billion rows
	--
	-- although the BigQuery / SQL query planning an optimization will be able to
	-- avoid ever constructing this particular table by splitting this task up and
	-- passing the results directly on to the next step which GROUPs by sample and
	-- tissue type, thereby producing a much smaller table
	j1 AS (
	SELECT
	g.tissueType AS GTEx_tissueType,
	g.gene_symbol,
	g.rankExpr AS gRank,
	t.sample_barcode,
	t.project AS TCGA_project,
	t.rankExpr AS tRank
	FROM
	gtexData g
	JOIN
	tcgaData t
	ON
	g.gene_symbol=t.gene_symbol ),
	--
	-- now that we have the results of the above JOIN, we can perform a CORR() operation
	-- using the ranks, GROUPing by sample (and project) and tissue type
	--
	-- the results of this final step is ~545K correlations, ie ~10300 x 53
	gtCorr AS (
	SELECT
	GTEx_tissueType,
	sample_barcode,
	TCGA_project,
	CORR(gRank,
	tRank) AS corr
	FROM
	j1
	GROUP BY
	1,
	2,
	3 )
	--
	-- this final step could have been included in the previous step, but in case
	-- we wanted to do some other final post-processing, we're setting it up like this
	SELECT
	*
	FROM
	gtCorr
	ORDER BY
	corr DESC