Created
November 28, 2017 23:20
-
-
Save smrgit/3e4023403a64f639429356b9e762c4f2 to your computer and use it in GitHub Desktop.
Spearman correlation between TCGA expression data and GTEx median tissue-type TPM
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH | |
-- | |
-- get the top-5000 most variable genes based on computing the standard | |
-- deviation on the GTEx "gene_median_tpm" table | |
GTEx_top5K AS ( | |
SELECT | |
gene_id, | |
gene_description, | |
STDDEV(gene_exp) AS sigmaExp | |
FROM | |
`isb-cgc.GTEx_v7.gene_median_tpm` | |
GROUP BY | |
1, | |
2 | |
ORDER BY | |
sigmaExp DESC | |
LIMIT | |
5000 ), | |
-- | |
-- similarly, get the top-5000 most variable genes from the TCGA data | |
-- (note that we are using only the Illumina HiSeq expression data) | |
TCGA_top5K AS ( | |
SELECT | |
HGNC_gene_symbol, | |
STDDEV(normalized_count) AS sigmaExp | |
FROM | |
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` | |
WHERE | |
platform="IlluminaHiSeq" | |
AND HGNC_gene_symbol IS NOT NULL | |
GROUP BY | |
1 | |
ORDER BY | |
sigmaExp DESC | |
LIMIT | |
5000 ), | |
-- | |
-- now let's join these two lists of 5000 genes so that we can | |
-- use the intersection as the basis of our downstream analysis; | |
-- the result is a list of about 3200 highly-variable genes | |
geneList AS ( | |
SELECT | |
gene_id, | |
HGNC_gene_symbol AS gene_symbol | |
FROM | |
GTEx_top5K | |
JOIN | |
TCGA_top5K | |
ON | |
gene_description=HGNC_gene_symbol ), | |
-- | |
-- now we get the actual TCGA expression data, for *all* samples, but only | |
-- for genes in our highly-variable list (and only from the Illumina HiSeq platform); | |
-- we also generate a "dense ranking" of the genes "partitioned by" sample; | |
-- the result of this query is going to be a rather large table: ~10,340 samples | |
-- and ~3200 genes --> over 33 million rows! | |
tcgaData AS ( | |
SELECT | |
sample_barcode, | |
project_short_name AS project, | |
HGNC_gene_symbol AS gene_symbol, | |
normalized_count AS expr, | |
DENSE_RANK() OVER (PARTITION BY sample_barcode ORDER BY normalized_count ASC) AS rankExpr | |
FROM | |
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` | |
WHERE | |
platform="IlluminaHiSeq" | |
AND HGNC_gene_symbol IN ( | |
SELECT | |
gene_symbol | |
FROM | |
geneList) ), | |
-- | |
-- and now we repeat the exact same thing with the GTEx data, although the | |
-- result will be a much smaller table since there are only 53 tissue types | |
gtexData AS ( | |
SELECT | |
SMTSD AS tissueType, | |
gene_description AS gene_symbol, | |
gene_exp AS expr, | |
DENSE_RANK() OVER (PARTITION BY SMTSD ORDER BY gene_exp ASC) AS rankExpr | |
FROM | |
`isb-cgc.GTEx_v7.gene_median_tpm` | |
WHERE | |
gene_description IN ( | |
SELECT | |
gene_symbol | |
FROM | |
geneList ) ), | |
-- | |
-- now we do a "JOIN" operation between the TCGA expression table and the | |
-- GTEx expression table, "ON" the gene symbol; the number of output rows | |
-- will be equal to: # of TCGA samples x # of GTEx tissue types x # of genes | |
-- 10340 x 53 x 3200 = 1.75 billion rows | |
-- | |
-- although the BigQuery / SQL query planning an optimization will be able to | |
-- avoid ever constructing this particular table by splitting this task up and | |
-- passing the results directly on to the next step which GROUPs by sample and | |
-- tissue type, thereby producing a much smaller table | |
j1 AS ( | |
SELECT | |
g.tissueType AS GTEx_tissueType, | |
g.gene_symbol, | |
g.rankExpr AS gRank, | |
t.sample_barcode, | |
t.project AS TCGA_project, | |
t.rankExpr AS tRank | |
FROM | |
gtexData g | |
JOIN | |
tcgaData t | |
ON | |
g.gene_symbol=t.gene_symbol ), | |
-- | |
-- now that we have the results of the above JOIN, we can perform a CORR() operation | |
-- using the ranks, GROUPing by sample (and project) and tissue type | |
-- | |
-- the results of this final step is ~545K correlations, ie ~10300 x 53 | |
gtCorr AS ( | |
SELECT | |
GTEx_tissueType, | |
sample_barcode, | |
TCGA_project, | |
CORR(gRank, | |
tRank) AS corr | |
FROM | |
j1 | |
GROUP BY | |
1, | |
2, | |
3 ) | |
-- | |
-- this final step could have been included in the previous step, but in case | |
-- we wanted to do some other final post-processing, we're setting it up like this | |
SELECT | |
* | |
FROM | |
gtCorr | |
ORDER BY | |
corr DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment