Last active
December 7, 2016 23:48
-
-
Save smrgit/7cbef49b8e772e9289a9454b3f65bc9b to your computer and use it in GitHub Desktop.
BigQuery standard SQL query for correlating GDC/HTSeq gene expression values with Toil/RSEM gene expression values.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Copyright 2016, Institute for Systems Biology | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
http://www.apache.org/licenses/LICENSE-2.0 | |
Unless required by applicable law or agreed to in writing, software | |
distributed under the License is distributed on an "AS IS" BASIS, | |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | |
limitations under the License. | |
*/ | |
WITH | |
-- | |
-- *GdcData0* | |
-- We start by extracting gene-expression data from the new GDC/hg38-based | |
-- table in the isb-cgc:hg38_data_previews dataset. For each row, we | |
-- extract simply the SamplesSubmitterID (aka the TCGA sample barcode), | |
-- the Ensembl gene ID (eg ENSG00000182253), and the FPKM value. | |
GdcData0 AS ( | |
SELECT | |
SamplesSubmitterID AS sampleID, | |
Ensembl_gene_ID AS geneID, | |
HTSeq__FPKM AS expFPKM | |
FROM | |
`isb-cgc.hg38_data_previews.TCGA_GeneExpressionQuantification` ), | |
-- | |
-- *GeneRef* | |
-- Next, we're going to get the gene-id to gene-symbol mapping from the | |
-- GENCODE reference table because the GDC table reference above contains | |
-- only the gene-id and it's always nice to have gene symbols too. | |
GeneRef AS ( | |
SELECT | |
gene_id, | |
gene_name | |
FROM | |
`isb-cgc.genome_reference.GENCODE_v24` | |
WHERE | |
feature="gene" ), | |
-- | |
-- *GdcData* | |
-- Now we'll join the two tables above to annotate the GDC expression data | |
-- with gene-symbols, and we'll call it GdcData. We're also going to create | |
-- a ranking of the expression values so that we can compute a Spearman | |
-- correlation later on. | |
GdcData AS ( | |
SELECT | |
GdcData0.sampleID, | |
GdcData0.geneID, | |
GeneRef.gene_name, | |
GdcData0.expFPKM, | |
DENSE_RANK() OVER (PARTITION BY GdcData0.geneID ORDER BY GdcData0.expFPKM ASC) AS rankFPKM | |
FROM | |
GdcData0 | |
JOIN | |
GeneRef | |
ON | |
GdcData0.geneID = GeneRef.gene_id ), | |
-- | |
-- *ToilData* | |
-- Next, we get the RSEM data that was produced by the Toil project. This | |
-- table has two values: FPKM and TPM, and we'll select and rank both of them. | |
ToilData AS ( | |
SELECT | |
SUBSTR(AliquotBarcode,1,16) AS SampleBarcode, | |
gene_id, | |
TPM, | |
FPKM, | |
DENSE_RANK() OVER (PARTITION BY gene_id ORDER BY TPM ASC) AS rankTPM, | |
DENSE_RANK() OVER (PARTITION BY gene_id ORDER BY FPKM ASC) AS rankFPKM | |
FROM | |
`isb-cgc.Toil_recompute.RNAseq_RSEM_byGene` ), | |
-- | |
-- *JoinAndCorr* | |
-- Finally, we join the two tables and compute correlations: | |
-- + one pair of correlations (Pearson and Spearman) between the Toil | |
-- FPKM and the GDC FPKM values | |
-- + one pair of correlations between the two different values that | |
-- come from Toil (FPKM and TPM) | |
JoinAndCorr AS ( | |
SELECT | |
GdcData.geneID AS gene_id, | |
GdcData.gene_name AS gene_name, | |
CORR(LOG10(GdcData.expFPKM+1), | |
LOG10(ToilData.FPKM+1)) AS pC_GdcToilFPKM, | |
CORR(GdcData.rankFPKM, | |
ToilData.rankFPKM) AS sC_GdcToilFPKM, | |
CORR(LOG10(ToilData.FPKM+1), | |
LOG10(ToilData.TPM+1)) AS pC_ToilFPKM_TPM, | |
CORR(ToilData.rankFPKM, | |
ToilData.rankTPM) AS sC_ToilFPKM_TPM | |
FROM | |
ToilData | |
JOIN | |
GdcData | |
ON | |
GdcData.sampleID=ToilData.SampleBarcode | |
AND GdcData.geneID=ToilData.gene_id | |
GROUP BY | |
GdcData.geneID, | |
GdcData.gene_name ) | |
-- | |
-- Our final select simply extracts the correlation values from the preceding | |
-- table, computes a couple of delta's and returns the genes sorted by the | |
-- Spearman correlation between the GDC and Toil datasets. | |
SELECT | |
gene_id, | |
gene_name, | |
pC_GdcToilFPKM, | |
sC_GdcToilFPKM, | |
(sC_GdcToilFPKM-pC_GdcToilFPKM) AS deltaFPKM, | |
pC_ToilFPKM_TPM, | |
sC_ToilFPKM_TPM, | |
(sC_ToilFPKM_TPM-pC_ToilFPKM_TPM) AS deltaToil | |
FROM | |
JoinAndCorr | |
WHERE | |
IS_NAN(sC_GdcToilFPKM) = FALSE | |
ORDER BY | |
sC_GdcToilFPKM DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment