Created
May 17, 2017 22:59
-
-
Save Gibbsdavidl/f72b66a3ede779c7cf3672e721baaeae to your computer and use it in GitHub Desktop.
Pathway_scoring_v0.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH | |
-- | |
-- we start by translating the correlations that we got to ranks, | |
-- based on sorting the genes on corrByGene "DESC" | |
-- this will result in the highest positive correlation getting | |
-- rank #1, etc | |
-- we also lightly filter the genes by excluding any with near-zero | |
-- or negative correlation coefficients, and the result is a list | |
-- of approx 9000 genes with symbol, correlation, and rank | |
geneScoresT AS ( | |
SELECT | |
gene, | |
CorrByGene, | |
-- here we do get a DENSE ranking based on ORDERing the | |
-- correlations from most positive downward | |
## DENSE_RANK() OVER(ORDER BY corrByGene ASC) AS gene_score | |
-- rather than doing a ranking, we could also just try using the | |
-- correlations directly, after inverting them so that a | |
-- a smaller value is "better" | |
(1-CorrByGene) AS gene_score | |
FROM | |
`isb-cgc-02-0001.Daves_working_area.CPTAC_mRNAseq_corrs` | |
WHERE | |
CorrByGene > 0.10 ), | |
-- | |
-- next we JOIN the ranks to pathways based on gene symbol; | |
-- basically what we are doing is annotating the ranking with the | |
-- pathway name | |
-- | |
annotScoresT AS ( | |
SELECT | |
gene, | |
corrByGene, | |
gene_score, | |
pathway | |
FROM | |
geneScoresT AS a | |
JOIN | |
`isb-cgc-02-0001.Daves_working_area.WikiPathways_20170425_Annotated` AS b | |
ON | |
a.gene = b.Symbol ), | |
-- | |
-- Now we're going to "score" the pathways. | |
-- For each ID/Name, we sum the scores, count | |
-- the number of genes, and then define the score as the sum | |
-- of the gene_scores divided by the number of genes | |
-- | |
pathwayScoresT AS ( | |
SELECT | |
pathway, | |
SUM(gene_score) AS sumGenesScore, | |
COUNT(gene) AS numGenes, | |
SUM(gene_score) / COUNT(gene) AS normScore | |
FROM | |
annotScoresT | |
GROUP BY | |
pathway ) | |
-- | |
-- Finally, we should have our scored pathways! | |
-- remember that the lower the score, the better, | |
-- so we sort ASCending | |
-- | |
SELECT | |
* | |
FROM | |
pathwayScoresT | |
WHERE | |
numGenes >= 20 | |
ORDER BY | |
normScore ASC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment