Gibbsdavidl/gist:f72b66a3ede779c7cf3672e721baaeae

## gistfile1.txt
WITH
  --
  -- we start by translating the correlations that we got to ranks,
  -- based on sorting the genes on corrByGene "DESC"
  -- this will result in the highest positive correlation getting
  -- rank #1, etc
  -- we also lightly filter the genes by excluding any with near-zero
  -- or negative correlation coefficients, and the result is a list
  -- of approx 9000 genes with symbol, correlation, and rank
  geneScoresT AS (
  SELECT
    gene,
    CorrByGene,
    -- here we do get a DENSE ranking based on ORDERing the
    -- correlations from most positive downward
    ## DENSE_RANK() OVER(ORDER BY corrByGene ASC) AS gene_score
    -- rather than doing a ranking, we could also just try using the
    -- correlations directly, after inverting them so that a
    -- a smaller value is "better"
    (1-CorrByGene) AS gene_score
  FROM
    `isb-cgc-02-0001.Daves_working_area.CPTAC_mRNAseq_corrs`
  WHERE
    CorrByGene > 0.10 ),
  --
  -- next we JOIN the ranks to pathways based on gene symbol;
  -- basically what we are doing is annotating the ranking with the
  -- pathway name
  --
  annotScoresT AS (
  SELECT
    gene,
    corrByGene,
    gene_score,
    pathway
  FROM
    geneScoresT AS a
  JOIN
    `isb-cgc-02-0001.Daves_working_area.WikiPathways_20170425_Annotated` AS b
  ON
    a.gene = b.Symbol ),
  --
  -- Now we're going to "score" the pathways.
  -- For each ID/Name, we sum the scores, count
  -- the number of genes, and then define the score as the sum
  -- of the gene_scores divided by the number of genes
  --
  pathwayScoresT AS (
  SELECT
    pathway,
    SUM(gene_score) AS sumGenesScore,
    COUNT(gene) AS numGenes,
    SUM(gene_score) / COUNT(gene) AS normScore
  FROM
    annotScoresT
  GROUP BY
    pathway )
      --
  -- Finally, we should have our scored pathways!
  -- remember that the lower the score, the better,
  -- so we sort ASCending
  --
SELECT
  *
FROM
  pathwayScoresT
WHERE
  numGenes >= 20
ORDER BY
  normScore ASC
	WITH
	--
	-- we start by translating the correlations that we got to ranks,
	-- based on sorting the genes on corrByGene "DESC"
	-- this will result in the highest positive correlation getting
	-- rank #1, etc
	-- we also lightly filter the genes by excluding any with near-zero
	-- or negative correlation coefficients, and the result is a list
	-- of approx 9000 genes with symbol, correlation, and rank
	geneScoresT AS (
	SELECT
	gene,
	CorrByGene,
	-- here we do get a DENSE ranking based on ORDERing the
	-- correlations from most positive downward
	## DENSE_RANK() OVER(ORDER BY corrByGene ASC) AS gene_score
	-- rather than doing a ranking, we could also just try using the
	-- correlations directly, after inverting them so that a
	-- a smaller value is "better"
	(1-CorrByGene) AS gene_score
	FROM
	`isb-cgc-02-0001.Daves_working_area.CPTAC_mRNAseq_corrs`
	WHERE
	CorrByGene > 0.10 ),
	--
	-- next we JOIN the ranks to pathways based on gene symbol;
	-- basically what we are doing is annotating the ranking with the
	-- pathway name
	--
	annotScoresT AS (
	SELECT
	gene,
	corrByGene,
	gene_score,
	pathway
	FROM
	geneScoresT AS a
	JOIN
	`isb-cgc-02-0001.Daves_working_area.WikiPathways_20170425_Annotated` AS b
	ON
	a.gene = b.Symbol ),
	--
	-- Now we're going to "score" the pathways.
	-- For each ID/Name, we sum the scores, count
	-- the number of genes, and then define the score as the sum
	-- of the gene_scores divided by the number of genes
	--
	pathwayScoresT AS (
	SELECT
	pathway,
	SUM(gene_score) AS sumGenesScore,
	COUNT(gene) AS numGenes,
	SUM(gene_score) / COUNT(gene) AS normScore
	FROM
	annotScoresT
	GROUP BY
	pathway )
	--
	-- Finally, we should have our scored pathways!
	-- remember that the lower the score, the better,
	-- so we sort ASCending
	--
	SELECT
	*
	FROM
	pathwayScoresT
	WHERE
	numGenes >= 20
	ORDER BY
	normScore ASC