Skip to content

Instantly share code, notes, and snippets.

@smrgit
Last active November 18, 2016 19:07
Show Gist options
  • Save smrgit/7d729ec783f8d8ed18545cb6028eafdf to your computer and use it in GitHub Desktop.
Save smrgit/7d729ec783f8d8ed18545cb6028eafdf to your computer and use it in GitHub Desktop.
Sample BigQuery SQL that correlates copy-number and gene-expression for TCGA samples in the BRCA cohort. There are two parts to this: the SQL and the UDF.
# paste this first part into the BigQuery "Query Editor" window
SELECT
gene,
chr,
CORR(avgCNsegMean,avglogExp) AS corr,
COUNT(*) AS n
FROM (
SELECT
annotCN.gene AS gene,
annotCN.chr AS chr,
annotCN.SampleBarcode AS SampleBarcode,
AVG(annotCN.CNsegMean) AS avgCNsegMean,
AVG(exp.logExp) AS avgLogExp
FROM (
SELECT
geneInfo.gene AS gene,
geneInfo.chr AS chr,
geneInfo.region_start AS gene_start,
geneInfo.region_end AS gene_end,
geneInfo.bin AS bin,
cnInfo.SampleBarcode AS SampleBarcode,
cnInfo.Segment_Mean AS CNsegMean,
cnInfo.region_start AS cn_start,
cnInfo.region_end AS cn_end
FROM (
SELECT
label AS gene,
chr,
region_start,
region_end,
bin
FROM ( binIntervals (
SELECT
gene_name AS label,
FLOAT(start) AS value,
LTRIM(seq_name,"chr") AS chr,
start AS region_start,
END AS region_end
FROM
[isb-cgc:genome_reference.GENCODE_v19]
WHERE
feature="gene"
AND gene_status="KNOWN" ) ) ) AS geneInfo
JOIN EACH (
SELECT
label AS SampleBarcode,
value AS Segment_Mean,
chr,
region_start,
region_end,
bin
FROM ( binIntervals (
SELECT
SampleBarcode AS label,
Segment_Mean AS value,
Chromosome AS chr,
start AS region_start,
END AS region_end
FROM
[isb-cgc:tcga_201607_beta.Copy_Number_segments]
WHERE
SampleBarcode IN (
SELECT
SampleBarcode
FROM
[isb-cgc:tcga_cohorts.BRCA] ) ) ) ) AS cnInfo
ON
( geneInfo.chr = cnInfo.chr )
AND ( geneInfo.bin = cnInfo.bin ) ) AS annotCN
JOIN EACH (
SELECT
SampleBarcode,
HGNC_gene_symbol,
LOG2(normalized_count+1) AS logExp
FROM
[isb-cgc:tcga_201607_beta.mRNA_UNC_HiSeq_RSEM]
WHERE
SampleBarcode IN (
SELECT
SampleBarcode
FROM
[isb-cgc:tcga_cohorts.BRCA] ) ) AS exp
ON
( exp.HGNC_gene_symbol = annotCN.gene )
AND ( exp.SampleBarcode = annotCN.SampleBarcode )
GROUP BY
gene,
chr,
SampleBarcode )
GROUP BY
gene,
chr
HAVING
corr IS NOT NULL
ORDER BY
corr DESC
## and paste this next bit into the "UDF Editor" window
function binIntervals(row, emit) {
var binSize = 10000; // Make sure this matches the value in the SQL (if necessary)
var startBin = Math.floor(row.region_start / binSize);
var endBin = Math.floor(row.region_end / binSize);
// Since an interval can span multiple bins, emit
// a record for each bin it spans.
for(var bin = startBin; bin <= endBin; bin++) {
emit({label: row.label,
value: row.value,
chr: row.chr,
region_start: row.region_start,
region_end: row.region_end,
bin: bin,
});
}
}
bigquery.defineFunction(
'binIntervals', // Name of the function exported to SQL
['label', 'value', 'chr', 'region_start', 'region_end'], // Names of input columns
[{'name': 'label', 'type': 'string'}, // Output schema
{'name': 'value', 'type': 'float'},
{'name': 'chr', 'type': 'string'},
{'name': 'region_start', 'type': 'integer'},
{'name': 'region_end', 'type': 'integer'},
{'name': 'bin', 'type': 'integer'}],
binIntervals // Reference to JavaScript UDF
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment