Skip to content

Instantly share code, notes, and snippets.

@smrgit
Created November 28, 2017 23:20
Show Gist options
  • Save smrgit/3e4023403a64f639429356b9e762c4f2 to your computer and use it in GitHub Desktop.
Save smrgit/3e4023403a64f639429356b9e762c4f2 to your computer and use it in GitHub Desktop.
Spearman correlation between TCGA expression data and GTEx median tissue-type TPM
WITH
--
-- get the top-5000 most variable genes based on computing the standard
-- deviation on the GTEx "gene_median_tpm" table
GTEx_top5K AS (
SELECT
gene_id,
gene_description,
STDDEV(gene_exp) AS sigmaExp
FROM
`isb-cgc.GTEx_v7.gene_median_tpm`
GROUP BY
1,
2
ORDER BY
sigmaExp DESC
LIMIT
5000 ),
--
-- similarly, get the top-5000 most variable genes from the TCGA data
-- (note that we are using only the Illumina HiSeq expression data)
TCGA_top5K AS (
SELECT
HGNC_gene_symbol,
STDDEV(normalized_count) AS sigmaExp
FROM
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM`
WHERE
platform="IlluminaHiSeq"
AND HGNC_gene_symbol IS NOT NULL
GROUP BY
1
ORDER BY
sigmaExp DESC
LIMIT
5000 ),
--
-- now let's join these two lists of 5000 genes so that we can
-- use the intersection as the basis of our downstream analysis;
-- the result is a list of about 3200 highly-variable genes
geneList AS (
SELECT
gene_id,
HGNC_gene_symbol AS gene_symbol
FROM
GTEx_top5K
JOIN
TCGA_top5K
ON
gene_description=HGNC_gene_symbol ),
--
-- now we get the actual TCGA expression data, for *all* samples, but only
-- for genes in our highly-variable list (and only from the Illumina HiSeq platform);
-- we also generate a "dense ranking" of the genes "partitioned by" sample;
-- the result of this query is going to be a rather large table: ~10,340 samples
-- and ~3200 genes --> over 33 million rows!
tcgaData AS (
SELECT
sample_barcode,
project_short_name AS project,
HGNC_gene_symbol AS gene_symbol,
normalized_count AS expr,
DENSE_RANK() OVER (PARTITION BY sample_barcode ORDER BY normalized_count ASC) AS rankExpr
FROM
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM`
WHERE
platform="IlluminaHiSeq"
AND HGNC_gene_symbol IN (
SELECT
gene_symbol
FROM
geneList) ),
--
-- and now we repeat the exact same thing with the GTEx data, although the
-- result will be a much smaller table since there are only 53 tissue types
gtexData AS (
SELECT
SMTSD AS tissueType,
gene_description AS gene_symbol,
gene_exp AS expr,
DENSE_RANK() OVER (PARTITION BY SMTSD ORDER BY gene_exp ASC) AS rankExpr
FROM
`isb-cgc.GTEx_v7.gene_median_tpm`
WHERE
gene_description IN (
SELECT
gene_symbol
FROM
geneList ) ),
--
-- now we do a "JOIN" operation between the TCGA expression table and the
-- GTEx expression table, "ON" the gene symbol; the number of output rows
-- will be equal to: # of TCGA samples x # of GTEx tissue types x # of genes
-- 10340 x 53 x 3200 = 1.75 billion rows
--
-- although the BigQuery / SQL query planning an optimization will be able to
-- avoid ever constructing this particular table by splitting this task up and
-- passing the results directly on to the next step which GROUPs by sample and
-- tissue type, thereby producing a much smaller table
j1 AS (
SELECT
g.tissueType AS GTEx_tissueType,
g.gene_symbol,
g.rankExpr AS gRank,
t.sample_barcode,
t.project AS TCGA_project,
t.rankExpr AS tRank
FROM
gtexData g
JOIN
tcgaData t
ON
g.gene_symbol=t.gene_symbol ),
--
-- now that we have the results of the above JOIN, we can perform a CORR() operation
-- using the ranks, GROUPing by sample (and project) and tissue type
--
-- the results of this final step is ~545K correlations, ie ~10300 x 53
gtCorr AS (
SELECT
GTEx_tissueType,
sample_barcode,
TCGA_project,
CORR(gRank,
tRank) AS corr
FROM
j1
GROUP BY
1,
2,
3 )
--
-- this final step could have been included in the previous step, but in case
-- we wanted to do some other final post-processing, we're setting it up like this
SELECT
*
FROM
gtCorr
ORDER BY
corr DESC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment