smrgit/gdc_toil_gexp.sql

## gdc_toil_gexp.sql
/*

Copyright 2016, Institute for Systems Biology

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

*/

WITH
  --
  -- *GdcData0*
  -- We start by extracting gene-expression data from the new GDC/hg38-based
  -- table in the isb-cgc:hg38_data_previews dataset.  For each row, we
  -- extract simply the SamplesSubmitterID (aka the TCGA sample barcode),
  -- the Ensembl gene ID (eg ENSG00000182253), and the FPKM value.
  GdcData0 AS (
  SELECT
    SamplesSubmitterID AS sampleID,
    Ensembl_gene_ID AS geneID,
    HTSeq__FPKM AS expFPKM
  FROM
    `isb-cgc.hg38_data_previews.TCGA_GeneExpressionQuantification` ),
  --
  -- *GeneRef*
  -- Next, we're going to get the gene-id to gene-symbol mapping from the
  -- GENCODE reference table because the GDC table reference above contains
  -- only the gene-id and it's always nice to have gene symbols too.
  GeneRef AS (
  SELECT
    gene_id,
    gene_name
  FROM
    `isb-cgc.genome_reference.GENCODE_v24`
  WHERE
    feature="gene" ),
  --
  -- *GdcData*
  -- Now we'll join the two tables above to annotate the GDC expression data
  -- with gene-symbols, and we'll call it GdcData.  We're also going to create
  -- a ranking of the expression values so that we can compute a Spearman
  -- correlation later on.
  GdcData AS (
  SELECT
    GdcData0.sampleID,
    GdcData0.geneID,
    GeneRef.gene_name,
    GdcData0.expFPKM,
    DENSE_RANK() OVER (PARTITION BY GdcData0.geneID ORDER BY GdcData0.expFPKM ASC) AS rankFPKM
  FROM
    GdcData0
  JOIN
    GeneRef
  ON
    GdcData0.geneID = GeneRef.gene_id ),
  --
  -- *ToilData*
  -- Next, we get the RSEM data that was produced by the Toil project. This
  -- table has two values: FPKM and TPM, and we'll select and rank both of them.
  ToilData AS (
  SELECT
    SUBSTR(AliquotBarcode,1,16) AS SampleBarcode,
    gene_id,
    TPM,
    FPKM,
    DENSE_RANK() OVER (PARTITION BY gene_id ORDER BY TPM  ASC) AS rankTPM,
    DENSE_RANK() OVER (PARTITION BY gene_id ORDER BY FPKM ASC) AS rankFPKM
  FROM
    `isb-cgc.Toil_recompute.RNAseq_RSEM_byGene` ),
  --
  -- *JoinAndCorr*
  -- Finally, we join the two tables and compute correlations:
  --     + one pair of correlations (Pearson and Spearman) between the Toil
  --       FPKM and the GDC FPKM values
  --     + one pair of correlations between the two different values that
  --       come from Toil (FPKM and TPM)
  JoinAndCorr AS (
  SELECT
    GdcData.geneID AS gene_id,
    GdcData.gene_name AS gene_name,
    CORR(LOG10(GdcData.expFPKM+1),
      LOG10(ToilData.FPKM+1)) AS pC_GdcToilFPKM,
    CORR(GdcData.rankFPKM,
      ToilData.rankFPKM) AS sC_GdcToilFPKM,
    CORR(LOG10(ToilData.FPKM+1),
      LOG10(ToilData.TPM+1)) AS pC_ToilFPKM_TPM,
    CORR(ToilData.rankFPKM,
      ToilData.rankTPM) AS sC_ToilFPKM_TPM
  FROM
    ToilData
  JOIN
    GdcData
  ON
    GdcData.sampleID=ToilData.SampleBarcode
    AND GdcData.geneID=ToilData.gene_id
  GROUP BY
    GdcData.geneID,
    GdcData.gene_name )
--
-- Our final select simply extracts the correlation values from the preceding
-- table, computes a couple of delta's and returns the genes sorted by the
-- Spearman correlation between the GDC and Toil datasets.
SELECT
  gene_id,
  gene_name,
  pC_GdcToilFPKM,
  sC_GdcToilFPKM,
  (sC_GdcToilFPKM-pC_GdcToilFPKM) AS deltaFPKM,
  pC_ToilFPKM_TPM,
  sC_ToilFPKM_TPM,
  (sC_ToilFPKM_TPM-pC_ToilFPKM_TPM) AS deltaToil
FROM
  JoinAndCorr
WHERE
  IS_NAN(sC_GdcToilFPKM) = FALSE
ORDER BY
  sC_GdcToilFPKM DESC
	/*

	Copyright 2016, Institute for Systems Biology

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.

	*/

	WITH
	--
	-- GdcData0
	-- We start by extracting gene-expression data from the new GDC/hg38-based
	-- table in the isb-cgc:hg38_data_previews dataset. For each row, we
	-- extract simply the SamplesSubmitterID (aka the TCGA sample barcode),
	-- the Ensembl gene ID (eg ENSG00000182253), and the FPKM value.
	GdcData0 AS (
	SELECT
	SamplesSubmitterID AS sampleID,
	Ensembl_gene_ID AS geneID,
	HTSeq__FPKM AS expFPKM
	FROM
	`isb-cgc.hg38_data_previews.TCGA_GeneExpressionQuantification` ),
	--
	-- GeneRef
	-- Next, we're going to get the gene-id to gene-symbol mapping from the
	-- GENCODE reference table because the GDC table reference above contains
	-- only the gene-id and it's always nice to have gene symbols too.
	GeneRef AS (
	SELECT
	gene_id,
	gene_name
	FROM
	`isb-cgc.genome_reference.GENCODE_v24`
	WHERE
	feature="gene" ),
	--
	-- GdcData
	-- Now we'll join the two tables above to annotate the GDC expression data
	-- with gene-symbols, and we'll call it GdcData. We're also going to create
	-- a ranking of the expression values so that we can compute a Spearman
	-- correlation later on.
	GdcData AS (
	SELECT
	GdcData0.sampleID,
	GdcData0.geneID,
	GeneRef.gene_name,
	GdcData0.expFPKM,
	DENSE_RANK() OVER (PARTITION BY GdcData0.geneID ORDER BY GdcData0.expFPKM ASC) AS rankFPKM
	FROM
	GdcData0
	JOIN
	GeneRef
	ON
	GdcData0.geneID = GeneRef.gene_id ),
	--
	-- ToilData
	-- Next, we get the RSEM data that was produced by the Toil project. This
	-- table has two values: FPKM and TPM, and we'll select and rank both of them.
	ToilData AS (
	SELECT
	SUBSTR(AliquotBarcode,1,16) AS SampleBarcode,
	gene_id,
	TPM,
	FPKM,
	DENSE_RANK() OVER (PARTITION BY gene_id ORDER BY TPM ASC) AS rankTPM,
	DENSE_RANK() OVER (PARTITION BY gene_id ORDER BY FPKM ASC) AS rankFPKM
	FROM
	`isb-cgc.Toil_recompute.RNAseq_RSEM_byGene` ),
	--
	-- JoinAndCorr
	-- Finally, we join the two tables and compute correlations:
	-- + one pair of correlations (Pearson and Spearman) between the Toil
	-- FPKM and the GDC FPKM values
	-- + one pair of correlations between the two different values that
	-- come from Toil (FPKM and TPM)
	JoinAndCorr AS (
	SELECT
	GdcData.geneID AS gene_id,
	GdcData.gene_name AS gene_name,
	CORR(LOG10(GdcData.expFPKM+1),
	LOG10(ToilData.FPKM+1)) AS pC_GdcToilFPKM,
	CORR(GdcData.rankFPKM,
	ToilData.rankFPKM) AS sC_GdcToilFPKM,
	CORR(LOG10(ToilData.FPKM+1),
	LOG10(ToilData.TPM+1)) AS pC_ToilFPKM_TPM,
	CORR(ToilData.rankFPKM,
	ToilData.rankTPM) AS sC_ToilFPKM_TPM
	FROM
	ToilData
	JOIN
	GdcData
	ON
	GdcData.sampleID=ToilData.SampleBarcode
	AND GdcData.geneID=ToilData.gene_id
	GROUP BY
	GdcData.geneID,
	GdcData.gene_name )
	--
	-- Our final select simply extracts the correlation values from the preceding
	-- table, computes a couple of delta's and returns the genes sorted by the
	-- Spearman correlation between the GDC and Toil datasets.
	SELECT
	gene_id,
	gene_name,
	pC_GdcToilFPKM,
	sC_GdcToilFPKM,
	(sC_GdcToilFPKM-pC_GdcToilFPKM) AS deltaFPKM,
	pC_ToilFPKM_TPM,
	sC_ToilFPKM_TPM,
	(sC_ToilFPKM_TPM-pC_ToilFPKM_TPM) AS deltaToil
	FROM
	JoinAndCorr
	WHERE
	IS_NAN(sC_GdcToilFPKM) = FALSE
	ORDER BY
	sC_GdcToilFPKM DESC