Last active
March 22, 2017 15:42
-
-
Save smrgit/be253b7f22a96a25711f835adb0d3562 to your computer and use it in GitHub Desktop.
BRCA CPTAC-RPPA correlation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH | |
-- first we get the 77 samples that passed the QC tests | |
qcSet AS ( | |
SELECT | |
TCGA_case_ID AS case_barcode | |
FROM | |
`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01` | |
WHERE | |
QC_Status="pass" ), | |
-- | |
-- next we extract the sample_barcode, gene, and logRatio from the | |
-- CPTAC BRCA proteome iTRAQ table, whlie at the same time doing a | |
-- JOIN with the qcSet in order to filter out samples that did not | |
-- pass the QC tests | |
-- this query returns 720196 avgLogRatio values for 77 samples and | |
-- 10599 genes -- on average 9353 genes are detected for each sample | |
p AS ( | |
SELECT | |
sample_barcode, | |
gene, | |
-- the AVG() function is used here for the 3 samples which | |
-- were assayed twice; to make sure that we have just one | |
-- value per sample per gene | |
AVG(unshared_logRatio) AS avgLogRatio | |
FROM | |
`isb-cgc.hg19_data_previews.TCGA_Breast_BI_Proteom_CDAP_r2_itraq` a | |
JOIN | |
qcSet b | |
ON | |
SUBSTR(a.sample_barcode,1,12)=b.case_barcode | |
GROUP BY | |
sample_barcode, | |
gene ), | |
-- | |
-- now we get the RPPA protein data, while filtering down to | |
-- the samples in the CPTAC set above ... | |
-- from this stage, we will wind up with 16551 rows, with data | |
-- for 74 samples, and 227 gene/protein pairs (176 unique genes, | |
-- and 214 proteins) -- some of the proteins are phospho-proteins | |
-- and in some cases multiple genes may map to a single | |
-- protein due to non-specific binding in the RPPA assay | |
j AS ( | |
SELECT | |
r.sample_barcode AS sample, | |
r.gene_name AS gene, | |
r.protein_name AS protein, | |
r.protein_expression AS rppaExp, | |
p.avgLogRatio AS iTRAQ_logRatio | |
FROM | |
`isb-cgc.TCGA_hg19_data_v0.Protein_Expression` r | |
JOIN | |
p | |
ON | |
SUBSTR(r.sample_barcode,1,15)=SUBSTR(p.sample_barcode,1,15) | |
AND r.gene_name=p.gene ), | |
-- | |
-- finally we compute a Pearson correlation on the two | |
-- expression values, and keep only those correlations | |
-- based on observations in at least 20 samples | |
-- (1/4 of the available data) | |
c AS ( | |
SELECT | |
CORR(iTRAQ_logRatio, | |
rppaExp) AS corrByGeneProt, | |
COUNT(*) AS n, | |
gene, | |
protein | |
FROM | |
j | |
GROUP BY | |
gene, | |
protein | |
HAVING | |
n>=20 ) | |
SELECT | |
* | |
FROM | |
c | |
ORDER BY | |
corrByGeneProt DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
modified the JOIN to only require that the sample barcode's match over the first 15 characters so that the -01A sample and the -01B sample from a particular case will be matched up (otherwise we lose 3 of the 77 samples with CPTAC data)