Last active
November 18, 2016 19:07
-
-
Save smrgit/7d729ec783f8d8ed18545cb6028eafdf to your computer and use it in GitHub Desktop.
Sample BigQuery SQL that correlates copy-number and gene-expression for TCGA samples in the BRCA cohort. There are two parts to this: the SQL and the UDF.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# paste this first part into the BigQuery "Query Editor" window | |
SELECT | |
gene, | |
chr, | |
CORR(avgCNsegMean,avglogExp) AS corr, | |
COUNT(*) AS n | |
FROM ( | |
SELECT | |
annotCN.gene AS gene, | |
annotCN.chr AS chr, | |
annotCN.SampleBarcode AS SampleBarcode, | |
AVG(annotCN.CNsegMean) AS avgCNsegMean, | |
AVG(exp.logExp) AS avgLogExp | |
FROM ( | |
SELECT | |
geneInfo.gene AS gene, | |
geneInfo.chr AS chr, | |
geneInfo.region_start AS gene_start, | |
geneInfo.region_end AS gene_end, | |
geneInfo.bin AS bin, | |
cnInfo.SampleBarcode AS SampleBarcode, | |
cnInfo.Segment_Mean AS CNsegMean, | |
cnInfo.region_start AS cn_start, | |
cnInfo.region_end AS cn_end | |
FROM ( | |
SELECT | |
label AS gene, | |
chr, | |
region_start, | |
region_end, | |
bin | |
FROM ( binIntervals ( | |
SELECT | |
gene_name AS label, | |
FLOAT(start) AS value, | |
LTRIM(seq_name,"chr") AS chr, | |
start AS region_start, | |
END AS region_end | |
FROM | |
[isb-cgc:genome_reference.GENCODE_v19] | |
WHERE | |
feature="gene" | |
AND gene_status="KNOWN" ) ) ) AS geneInfo | |
JOIN EACH ( | |
SELECT | |
label AS SampleBarcode, | |
value AS Segment_Mean, | |
chr, | |
region_start, | |
region_end, | |
bin | |
FROM ( binIntervals ( | |
SELECT | |
SampleBarcode AS label, | |
Segment_Mean AS value, | |
Chromosome AS chr, | |
start AS region_start, | |
END AS region_end | |
FROM | |
[isb-cgc:tcga_201607_beta.Copy_Number_segments] | |
WHERE | |
SampleBarcode IN ( | |
SELECT | |
SampleBarcode | |
FROM | |
[isb-cgc:tcga_cohorts.BRCA] ) ) ) ) AS cnInfo | |
ON | |
( geneInfo.chr = cnInfo.chr ) | |
AND ( geneInfo.bin = cnInfo.bin ) ) AS annotCN | |
JOIN EACH ( | |
SELECT | |
SampleBarcode, | |
HGNC_gene_symbol, | |
LOG2(normalized_count+1) AS logExp | |
FROM | |
[isb-cgc:tcga_201607_beta.mRNA_UNC_HiSeq_RSEM] | |
WHERE | |
SampleBarcode IN ( | |
SELECT | |
SampleBarcode | |
FROM | |
[isb-cgc:tcga_cohorts.BRCA] ) ) AS exp | |
ON | |
( exp.HGNC_gene_symbol = annotCN.gene ) | |
AND ( exp.SampleBarcode = annotCN.SampleBarcode ) | |
GROUP BY | |
gene, | |
chr, | |
SampleBarcode ) | |
GROUP BY | |
gene, | |
chr | |
HAVING | |
corr IS NOT NULL | |
ORDER BY | |
corr DESC | |
## and paste this next bit into the "UDF Editor" window | |
function binIntervals(row, emit) { | |
var binSize = 10000; // Make sure this matches the value in the SQL (if necessary) | |
var startBin = Math.floor(row.region_start / binSize); | |
var endBin = Math.floor(row.region_end / binSize); | |
// Since an interval can span multiple bins, emit | |
// a record for each bin it spans. | |
for(var bin = startBin; bin <= endBin; bin++) { | |
emit({label: row.label, | |
value: row.value, | |
chr: row.chr, | |
region_start: row.region_start, | |
region_end: row.region_end, | |
bin: bin, | |
}); | |
} | |
} | |
bigquery.defineFunction( | |
'binIntervals', // Name of the function exported to SQL | |
['label', 'value', 'chr', 'region_start', 'region_end'], // Names of input columns | |
[{'name': 'label', 'type': 'string'}, // Output schema | |
{'name': 'value', 'type': 'float'}, | |
{'name': 'chr', 'type': 'string'}, | |
{'name': 'region_start', 'type': 'integer'}, | |
{'name': 'region_end', 'type': 'integer'}, | |
{'name': 'bin', 'type': 'integer'}], | |
binIntervals // Reference to JavaScript UDF | |
); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment