Last active
April 14, 2017 17:35
-
-
Save smrgit/d7ecbaddb41341c7835e397737e5d4d9 to your computer and use it in GitHub Desktop.
BQ GDC hg19 vs hg38 miRNA correlation analysis -- at the isoform expression level
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WITH | |
hg38_d1 AS ( | |
-- we start with a table at the aliquot level, in case there are multiple aliquots | |
-- for a single sample; the SUM() is to sum the isoforms since we're working | |
-- with the Isoform_Expression tables | |
SELECT | |
sample_barcode, | |
aliquot_barcode, | |
mirna_id, | |
mirna_accession, | |
mirna_transcript, | |
SUM(reads_per_million_miRNA_mapped) AS sumRPM | |
FROM | |
`isb-cgc.TCGA_hg38_data_v0.miRNAseq_Isoform_Expression` | |
WHERE | |
mirna_accession IS NOT NULL | |
GROUP BY | |
sample_barcode, | |
aliquot_barcode, | |
mirna_id, | |
mirna_accession, | |
mirna_transcript ), | |
hg38_d2 AS ( | |
-- next we do a MAX() over the sumRPM values from the aliquots and just | |
-- keep the sample_barcode; and we're also going to do a LOG() | |
SELECT | |
sample_barcode, | |
mirna_id, | |
mirna_accession, | |
mirna_transcript, | |
LOG(MAX(sumRPM)) AS logRPM | |
FROM | |
hg38_d1 | |
GROUP BY | |
sample_barcode, | |
mirna_id, | |
mirna_accession, | |
mirna_transcript ), | |
hg19_d1 AS ( | |
-- now we're going to do the same with the hg19 data ... | |
SELECT | |
sample_barcode, | |
aliquot_barcode, | |
mirna_id, | |
mirna_accession, | |
mirna_transcript, | |
SUM(reads_per_million_miRNA_mapped) AS sumRPM | |
FROM | |
`isb-cgc.TCGA_hg19_data_v0.miRNAseq_Isoform_Expression` | |
WHERE | |
mirna_accession IS NOT NULL | |
GROUP BY | |
sample_barcode, | |
aliquot_barcode, | |
mirna_id, | |
mirna_accession, | |
mirna_transcript ), | |
hg19_d2 AS ( | |
SELECT | |
sample_barcode, | |
mirna_id, | |
mirna_accession, | |
mirna_transcript, | |
LOG(MAX(sumRPM)) AS logRPM | |
FROM | |
hg19_d1 | |
GROUP BY | |
sample_barcode, | |
mirna_id, | |
mirna_accession, | |
mirna_transcript ), | |
j1 AS ( | |
SELECT | |
a.logRPM AS hg19_logRPM, | |
b.logRPM AS hg38_logRPM, | |
b.sample_barcode AS sample_barcode, | |
b.mirna_id AS mirna_id, | |
b.mirna_accession AS mirna_accession, | |
b.mirna_transcript AS mirna_transcript | |
FROM | |
hg19_d2 a | |
JOIN | |
hg38_d2 b | |
ON | |
a.sample_barcode=b.sample_barcode | |
AND a.mirna_id=b.mirna_id | |
AND a.mirna_accession=b.mirna_accession | |
AND a.mirna_transcript=b.mirna_transcript ) | |
SELECT | |
mirna_id, | |
mirna_accession, | |
mirna_transcript, | |
CORR(hg19_logRPM, | |
hg38_logRPM) AS corr, | |
COUNT(*) AS n | |
FROM | |
j1 | |
GROUP BY | |
mirna_id, | |
mirna_accession, | |
mirna_transcript | |
HAVING | |
n >= 1000 | |
ORDER BY | |
corr DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
updated to use new ISB-CGC tables