Skip to content

Instantly share code, notes, and snippets.

@Gibbsdavidl
Created February 27, 2018 02:06
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save Gibbsdavidl/5580a4cb7834e2658613bb7bec07ae49 to your computer and use it in GitHub Desktop.
BigQuery to perform Spearman's correlation over all pairs of genes given Reactome pathway definitions.
WITH
--
--
--
geneList AS (
SELECT
a.entrez AS Entrez,
Symbol AS HGNC_gene_symbol
FROM
`isb-cgc.QotM.Reactome_a1` a
JOIN
`isb-cgc.QotM.org_Hs_eg_db_v1` b
ON
a.entrez = b.Entrez
WHERE
pathwayName = 'CELL CYCLE'
GROUP BY
Symbol,
Entrez
ORDER BY
rand()
LIMIT 10),
--
--
geneAnnot AS (
SELECT
Entrez,
HGNC_gene_symbol,
seq_name,
MIN(start) AS geneStart
FROM
`isb-cgc.genome_reference.GENCODE_v24` a
JOIN
geneList
ON
geneList.HGNC_gene_symbol = a.gene_name
GROUP BY
Entrez,
HGNC_gene_symbol,
seq_name ),
--
--
--
cohortExpr1 AS (
SELECT
sample_barcode,
a.HGNC_gene_symbol,
seq_name,
geneStart,
RANK() OVER (PARTITION BY a.HGNC_gene_symbol ORDER BY normalized_count ASC) AS expr_rank
FROM
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` a
JOIN
geneAnnot
ON
geneAnnot.HGNC_gene_symbol = a.HGNC_gene_symbol
WHERE
project_short_name = 'TCGA-LUAD'
AND normalized_count IS NOT NULL
AND normalized_count > 10
AND ENDS_WITH(sample_barcode, '-01A') ),
--
--
--
cohortExpr2 AS (
SELECT
sample_barcode,
a.HGNC_gene_symbol,
seq_name,
geneStart,
RANK() OVER (PARTITION BY a.HGNC_gene_symbol ORDER BY normalized_count ASC) AS expr_rank
FROM
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` a
JOIN
geneAnnot
ON
geneAnnot.HGNC_gene_symbol = a.HGNC_gene_symbol
WHERE
project_short_name = 'TCGA-LUAD'
AND normalized_count IS NOT NULL
AND normalized_count > 0
AND ENDS_WITH(sample_barcode, '-01A') ),
--
--
--
jtab AS (
SELECT
cohortExpr1.sample_barcode,
cohortExpr2.sample_barcode,
cohortExpr1.HGNC_gene_symbol AS geneA,
cohortExpr1.seq_name AS chrA,
cohortExpr1.geneStart AS startA,
cohortExpr2.HGNC_gene_symbol AS geneB,
cohortExpr2.seq_name AS chrB,
cohortExpr2.geneStart AS startB,
cohortExpr1.expr_rank AS e1,
cohortExpr2.expr_rank AS e2
FROM
cohortExpr1
JOIN
cohortExpr2
ON
cohortExpr1.sample_barcode = cohortExpr2.sample_barcode
WHERE
cohortExpr1.HGNC_gene_symbol <> cohortExpr2.HGNC_gene_symbol
AND cohortExpr1.HGNC_gene_symbol > cohortExpr2.HGNC_gene_symbol
GROUP BY
cohortExpr1.sample_barcode,
cohortExpr2.sample_barcode,
cohortExpr1.HGNC_gene_symbol,
cohortExpr2.HGNC_gene_symbol,
cohortExpr1.seq_name,
cohortExpr1.geneStart,
cohortExpr2.seq_name,
cohortExpr2.geneStart,
cohortExpr1.expr_rank,
cohortExpr2.expr_rank ),
--
--
finalResult AS (
SELECT
geneA,
chrA,
startA,
geneB,
chrB,
startB,
corr(e1,
e2) AS spearmans
FROM
jtab
GROUP BY
geneA,
chrA,
startA,
geneB,
chrB,
startB
ORDER BY
ABS(spearmans) DESC
LIMIT
100
)
select * from finalResult
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment