smrgit/cn-exp-corr-BRCA.bq

## cn-exp-corr-BRCA.bq
# paste this first part into the BigQuery "Query Editor" window

SELECT
  gene,
  chr,
  CORR(avgCNsegMean,avglogExp) AS corr,
  COUNT(*) AS n
FROM (
  SELECT
    annotCN.gene AS gene,
    annotCN.chr AS chr,
    annotCN.SampleBarcode AS SampleBarcode,
    AVG(annotCN.CNsegMean) AS avgCNsegMean,
    AVG(exp.logExp) AS avgLogExp
  FROM (
    SELECT
      geneInfo.gene AS gene,
      geneInfo.chr AS chr,
      geneInfo.region_start AS gene_start,
      geneInfo.region_end AS gene_end,
      geneInfo.bin AS bin,
      cnInfo.SampleBarcode AS SampleBarcode,
      cnInfo.Segment_Mean AS CNsegMean,
      cnInfo.region_start AS cn_start,
      cnInfo.region_end AS cn_end
    FROM (
      SELECT
        label AS gene,
        chr,
        region_start,
        region_end,
        bin
      FROM ( binIntervals (
          SELECT
            gene_name AS label,
            FLOAT(start) AS value,
            LTRIM(seq_name,"chr") AS chr,
            start AS region_start,
            END AS region_end
          FROM
            [isb-cgc:genome_reference.GENCODE_v19]
          WHERE
            feature="gene"
            AND gene_status="KNOWN" ) ) ) AS geneInfo
    JOIN EACH (
      SELECT
        label AS SampleBarcode,
        value AS Segment_Mean,
        chr,
        region_start,
        region_end,
        bin
      FROM ( binIntervals (
          SELECT
            SampleBarcode AS label,
            Segment_Mean AS value,
            Chromosome AS chr,
            start AS region_start,
            END AS region_end
          FROM
            [isb-cgc:tcga_201607_beta.Copy_Number_segments]
          WHERE
            SampleBarcode IN (
            SELECT
              SampleBarcode
            FROM
              [isb-cgc:tcga_cohorts.BRCA] ) ) ) ) AS cnInfo
    ON
      ( geneInfo.chr = cnInfo.chr )
      AND ( geneInfo.bin = cnInfo.bin ) ) AS annotCN
  JOIN EACH (
    SELECT
      SampleBarcode,
      HGNC_gene_symbol,
      LOG2(normalized_count+1) AS logExp
    FROM
      [isb-cgc:tcga_201607_beta.mRNA_UNC_HiSeq_RSEM]
    WHERE
      SampleBarcode IN (
      SELECT
        SampleBarcode
      FROM
        [isb-cgc:tcga_cohorts.BRCA] ) ) AS exp
  ON
    ( exp.HGNC_gene_symbol = annotCN.gene )
    AND ( exp.SampleBarcode = annotCN.SampleBarcode )
  GROUP BY
    gene,
    chr,
    SampleBarcode )
GROUP BY
  gene,
  chr
HAVING
  corr IS NOT NULL
ORDER BY
  corr DESC

## and paste this next bit into the "UDF Editor" window

function binIntervals(row, emit) {
  var binSize = 10000;  // Make sure this matches the value in the SQL (if necessary)
  var startBin = Math.floor(row.region_start / binSize);
  var endBin = Math.floor(row.region_end / binSize);
  // Since an interval can span multiple bins, emit
  // a record for each bin it spans.
  for(var bin = startBin; bin <= endBin; bin++) {
    emit({label: row.label,
          value: row.value,
          chr: row.chr,
          region_start: row.region_start,
          region_end: row.region_end,
          bin: bin,
         });
  }
}

bigquery.defineFunction(
  'binIntervals',                                // Name of the function exported to SQL
  ['label', 'value', 'chr', 'region_start', 'region_end'], // Names of input columns
  [{'name': 'label', 'type': 'string'},           // Output schema
   {'name': 'value', 'type': 'float'},
   {'name': 'chr',   'type': 'string'},
   {'name': 'region_start', 'type': 'integer'},
   {'name': 'region_end',   'type': 'integer'},
   {'name': 'bin',   'type': 'integer'}],
  binIntervals                                   // Reference to JavaScript UDF
);
	# paste this first part into the BigQuery "Query Editor" window

	SELECT
	gene,
	chr,
	CORR(avgCNsegMean,avglogExp) AS corr,
	COUNT(*) AS n
	FROM (
	SELECT
	annotCN.gene AS gene,
	annotCN.chr AS chr,
	annotCN.SampleBarcode AS SampleBarcode,
	AVG(annotCN.CNsegMean) AS avgCNsegMean,
	AVG(exp.logExp) AS avgLogExp
	FROM (
	SELECT
	geneInfo.gene AS gene,
	geneInfo.chr AS chr,
	geneInfo.region_start AS gene_start,
	geneInfo.region_end AS gene_end,
	geneInfo.bin AS bin,
	cnInfo.SampleBarcode AS SampleBarcode,
	cnInfo.Segment_Mean AS CNsegMean,
	cnInfo.region_start AS cn_start,
	cnInfo.region_end AS cn_end
	FROM (
	SELECT
	label AS gene,
	chr,
	region_start,
	region_end,
	bin
	FROM ( binIntervals (
	SELECT
	gene_name AS label,
	FLOAT(start) AS value,
	LTRIM(seq_name,"chr") AS chr,
	start AS region_start,
	END AS region_end
	FROM
	[isb-cgc:genome_reference.GENCODE_v19]
	WHERE
	feature="gene"
	AND gene_status="KNOWN" ) ) ) AS geneInfo
	JOIN EACH (
	SELECT
	label AS SampleBarcode,
	value AS Segment_Mean,
	chr,
	region_start,
	region_end,
	bin
	FROM ( binIntervals (
	SELECT
	SampleBarcode AS label,
	Segment_Mean AS value,
	Chromosome AS chr,
	start AS region_start,
	END AS region_end
	FROM
	[isb-cgc:tcga_201607_beta.Copy_Number_segments]
	WHERE
	SampleBarcode IN (
	SELECT
	SampleBarcode
	FROM
	[isb-cgc:tcga_cohorts.BRCA] ) ) ) ) AS cnInfo
	ON
	( geneInfo.chr = cnInfo.chr )
	AND ( geneInfo.bin = cnInfo.bin ) ) AS annotCN
	JOIN EACH (
	SELECT
	SampleBarcode,
	HGNC_gene_symbol,
	LOG2(normalized_count+1) AS logExp
	FROM
	[isb-cgc:tcga_201607_beta.mRNA_UNC_HiSeq_RSEM]
	WHERE
	SampleBarcode IN (
	SELECT
	SampleBarcode
	FROM
	[isb-cgc:tcga_cohorts.BRCA] ) ) AS exp
	ON
	( exp.HGNC_gene_symbol = annotCN.gene )
	AND ( exp.SampleBarcode = annotCN.SampleBarcode )
	GROUP BY
	gene,
	chr,
	SampleBarcode )
	GROUP BY
	gene,
	chr
	HAVING
	corr IS NOT NULL
	ORDER BY
	corr DESC

	## and paste this next bit into the "UDF Editor" window

	function binIntervals(row, emit) {
	var binSize = 10000; // Make sure this matches the value in the SQL (if necessary)
	var startBin = Math.floor(row.region_start / binSize);
	var endBin = Math.floor(row.region_end / binSize);
	// Since an interval can span multiple bins, emit
	// a record for each bin it spans.
	for(var bin = startBin; bin <= endBin; bin++) {
	emit({label: row.label,
	value: row.value,
	chr: row.chr,
	region_start: row.region_start,
	region_end: row.region_end,
	bin: bin,
	});
	}
	}

	bigquery.defineFunction(
	'binIntervals', // Name of the function exported to SQL
	['label', 'value', 'chr', 'region_start', 'region_end'], // Names of input columns
	[{'name': 'label', 'type': 'string'}, // Output schema
	{'name': 'value', 'type': 'float'},
	{'name': 'chr', 'type': 'string'},
	{'name': 'region_start', 'type': 'integer'},
	{'name': 'region_end', 'type': 'integer'},
	{'name': 'bin', 'type': 'integer'}],
	binIntervals // Reference to JavaScript UDF
	);