Skip to content

Instantly share code, notes, and snippets.

@smrgit
Last active April 14, 2017 17:35
Show Gist options
  • Save smrgit/d7ecbaddb41341c7835e397737e5d4d9 to your computer and use it in GitHub Desktop.
Save smrgit/d7ecbaddb41341c7835e397737e5d4d9 to your computer and use it in GitHub Desktop.
BQ GDC hg19 vs hg38 miRNA correlation analysis -- at the isoform expression level
WITH
hg38_d1 AS (
-- we start with a table at the aliquot level, in case there are multiple aliquots
-- for a single sample; the SUM() is to sum the isoforms since we're working
-- with the Isoform_Expression tables
SELECT
sample_barcode,
aliquot_barcode,
mirna_id,
mirna_accession,
mirna_transcript,
SUM(reads_per_million_miRNA_mapped) AS sumRPM
FROM
`isb-cgc.TCGA_hg38_data_v0.miRNAseq_Isoform_Expression`
WHERE
mirna_accession IS NOT NULL
GROUP BY
sample_barcode,
aliquot_barcode,
mirna_id,
mirna_accession,
mirna_transcript ),
hg38_d2 AS (
-- next we do a MAX() over the sumRPM values from the aliquots and just
-- keep the sample_barcode; and we're also going to do a LOG()
SELECT
sample_barcode,
mirna_id,
mirna_accession,
mirna_transcript,
LOG(MAX(sumRPM)) AS logRPM
FROM
hg38_d1
GROUP BY
sample_barcode,
mirna_id,
mirna_accession,
mirna_transcript ),
hg19_d1 AS (
-- now we're going to do the same with the hg19 data ...
SELECT
sample_barcode,
aliquot_barcode,
mirna_id,
mirna_accession,
mirna_transcript,
SUM(reads_per_million_miRNA_mapped) AS sumRPM
FROM
`isb-cgc.TCGA_hg19_data_v0.miRNAseq_Isoform_Expression`
WHERE
mirna_accession IS NOT NULL
GROUP BY
sample_barcode,
aliquot_barcode,
mirna_id,
mirna_accession,
mirna_transcript ),
hg19_d2 AS (
SELECT
sample_barcode,
mirna_id,
mirna_accession,
mirna_transcript,
LOG(MAX(sumRPM)) AS logRPM
FROM
hg19_d1
GROUP BY
sample_barcode,
mirna_id,
mirna_accession,
mirna_transcript ),
j1 AS (
SELECT
a.logRPM AS hg19_logRPM,
b.logRPM AS hg38_logRPM,
b.sample_barcode AS sample_barcode,
b.mirna_id AS mirna_id,
b.mirna_accession AS mirna_accession,
b.mirna_transcript AS mirna_transcript
FROM
hg19_d2 a
JOIN
hg38_d2 b
ON
a.sample_barcode=b.sample_barcode
AND a.mirna_id=b.mirna_id
AND a.mirna_accession=b.mirna_accession
AND a.mirna_transcript=b.mirna_transcript )
SELECT
mirna_id,
mirna_accession,
mirna_transcript,
CORR(hg19_logRPM,
hg38_logRPM) AS corr,
COUNT(*) AS n
FROM
j1
GROUP BY
mirna_id,
mirna_accession,
mirna_transcript
HAVING
n >= 1000
ORDER BY
corr DESC
@smrgit
Copy link
Author

smrgit commented Apr 14, 2017

updated to use new ISB-CGC tables

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment