smrgit/GO_scoring_v0.sql

## GO_scoring_v0.sql
WITH
  --
  -- we start by translating the correlations that we got to ranks,
  -- based on sorting the genes on corrByGene "DESC"
  -- this will result in the highest positive correlation getting
  -- rank #1, etc
  -- we also lightly filter the genes by excluding any with near-zero
  -- or negative correlation coefficients, and the result is a list
  -- of approx 9000 genes with symbol, correlation, and rank
  geneScoresT AS (
  SELECT
    gene,
    corrByGene,
    -- here we do get a DENSE ranking based on ORDERing the
    -- correlations from most positive downward
    ## DENSE_RANK() OVER(ORDER BY corrByGene ASC) AS gene_score
    -- rather than doing a ranking, we could also just try using the
    -- correlations directly, after inverting them so that a
    -- a smaller value is "better"
    (1-corrByGene) AS gene_score
  FROM
    `isb-cgc.smr_scratch.CPTAC_mRNAseq_corrs`
  WHERE
    corrByGene>0.10 ),
  --
  -- next we JOIN the ranks to GO terms based on gene symbol;
  -- basically what we are doing is annotating the ranking with the
  -- GO_ID, GO_Name and Evidence category:
  --
  annotScoresT AS (
  SELECT
    gene,
    corrByGene,
    gene_score,
    GO_ID,
    GO_Name,
    Evidence
  FROM
    geneScoresT AS a
  JOIN
    `isb-cgc-02-0001.Daves_working_area.go_slim_hgnc` AS b
  ON
    a.gene = b.Symbol ),
  --
  -- Now we're going to "score" the GO gene-sets by grouping on
  -- GO_ID and GO_Name.  For each ID/Name, we sum the ranks, count
  -- the number of genes, and then define the score as the sum
  -- of the ranks divided by the number of genes
  --
  goScoresT AS (
  SELECT
    GO_ID,
    GO_Name,
    SUM(gene_score) AS sumGenesScore,
    COUNT(gene) AS numGenes,
    SUM(gene_score) / COUNT(gene) AS normScore
  FROM
    annotScoresT
  WHERE
    ( Evidence="EXP"
      OR Evidence="IDA"
      OR Evidence="IPI"
      OR Evidence="IMP"
      OR Evidence="IGI"
      OR Evidence="IEP" )
  GROUP BY
    GO_ID,
    GO_Name )
  --
  -- Finally, we should have our scored GO categories!
  -- remember that the lower the score, the better,
  -- so we sort ASCending
  --
SELECT
  *
FROM
  goScoresT
WHERE
  numGenes >= 20
ORDER BY
  normScore ASC
	WITH
	--
	-- we start by translating the correlations that we got to ranks,
	-- based on sorting the genes on corrByGene "DESC"
	-- this will result in the highest positive correlation getting
	-- rank #1, etc
	-- we also lightly filter the genes by excluding any with near-zero
	-- or negative correlation coefficients, and the result is a list
	-- of approx 9000 genes with symbol, correlation, and rank
	geneScoresT AS (
	SELECT
	gene,
	corrByGene,
	-- here we do get a DENSE ranking based on ORDERing the
	-- correlations from most positive downward
	## DENSE_RANK() OVER(ORDER BY corrByGene ASC) AS gene_score
	-- rather than doing a ranking, we could also just try using the
	-- correlations directly, after inverting them so that a
	-- a smaller value is "better"
	(1-corrByGene) AS gene_score
	FROM
	`isb-cgc.smr_scratch.CPTAC_mRNAseq_corrs`
	WHERE
	corrByGene>0.10 ),
	--
	-- next we JOIN the ranks to GO terms based on gene symbol;
	-- basically what we are doing is annotating the ranking with the
	-- GO_ID, GO_Name and Evidence category:
	--
	annotScoresT AS (
	SELECT
	gene,
	corrByGene,
	gene_score,
	GO_ID,
	GO_Name,
	Evidence
	FROM
	geneScoresT AS a
	JOIN
	`isb-cgc-02-0001.Daves_working_area.go_slim_hgnc` AS b
	ON
	a.gene = b.Symbol ),
	--
	-- Now we're going to "score" the GO gene-sets by grouping on
	-- GO_ID and GO_Name. For each ID/Name, we sum the ranks, count
	-- the number of genes, and then define the score as the sum
	-- of the ranks divided by the number of genes
	--
	goScoresT AS (
	SELECT
	GO_ID,
	GO_Name,
	SUM(gene_score) AS sumGenesScore,
	COUNT(gene) AS numGenes,
	SUM(gene_score) / COUNT(gene) AS normScore
	FROM
	annotScoresT
	WHERE
	( Evidence="EXP"
	OR Evidence="IDA"
	OR Evidence="IPI"
	OR Evidence="IMP"
	OR Evidence="IGI"
	OR Evidence="IEP" )
	GROUP BY
	GO_ID,
	GO_Name )
	--
	-- Finally, we should have our scored GO categories!
	-- remember that the lower the score, the better,
	-- so we sort ASCending
	--
	SELECT
	*
	FROM
	goScoresT
	WHERE
	numGenes >= 20
	ORDER BY
	normScore ASC