Sheila smrgit

## cn-exp-corr-BRCA.bq
# paste this first part into the BigQuery "Query Editor" window

SELECT
  gene,
  chr,
  CORR(avgCNsegMean,avglogExp) AS corr,
  COUNT(*) AS n
FROM (
  SELECT
    annotCN.gene AS gene,

## gdc_toil_gexp.sql
/*

Copyright 2016, Institute for Systems Biology

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

## kMeans_in_BQ.sql
  CREATE TEMPORARY FUNCTION

    -- In this function, we're going to be working on arrays of values.
    -- We're also going to define a set of functions 'inside' kMeans.

    -- Heavily borrowing from https://github.com/NathanEpstein/clusters

    kMeans(x ARRAY<FLOAT64>,  -- ESR1 gene expression
           y ARRAY<FLOAT64>,  -- EGFR gene expression
           iterations FLOAT64,  -- the number of iterations

## miRNA_hg19_hg38_isoform_corr.sql
WITH
  hg38_d1 AS (
    -- we start with a table at the aliquot level, in case there are multiple aliquots
    -- for a single sample; the SUM() is to sum the isoforms since we're working
    -- with the Isoform_Expression tables
  SELECT
    sample_barcode,
    aliquot_barcode,
    mirna_id,
    mirna_accession,

## MIMAT0000082_hg19_vs_hg38
WITH
  hg38_d1 AS (
    -- we start with a table at the aliquot level, in case there are multiple aliquots
    -- for a single sample; the SUM() is to sum the isoforms since we're working
    -- with the Isoform_Expression tables
  SELECT
    sample_barcode,
    aliquot_barcode,
    mirna_id,
    mirna_accession,

## BRCA_CPTAC_RNAseq_corr.sql
WITH
  -- first we get the 77 samples that passed the QC tests
  qcSet AS (
  -- qcSet: one column, case_barcode (the table's TCGA_case_ID, renamed),
  -- taken from the rows of the breast supplementary table whose
  -- QC_Status is "pass".
  SELECT
    TCGA_case_ID AS case_barcode
  FROM
    `isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
  WHERE
    -- keep only the entries flagged as having passed QC
    QC_Status="pass" ),
  --

## BRCA_CPTAC_RPPA_corr.sql
WITH
  -- first we get the 77 samples that passed the QC tests
  qcSet AS (
  -- qcSet: one column, case_barcode (the table's TCGA_case_ID, renamed),
  -- taken from the rows of the breast supplementary table whose
  -- QC_Status is "pass".
  SELECT
    TCGA_case_ID AS case_barcode
  FROM
    `isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
  WHERE
    -- keep only the entries flagged as having passed QC
    QC_Status="pass" ),
  --

## miRNA_hg19_hg38_stemloop_corr.sql
WITH
  hg38_d1 AS (
    -- we start with a table at the aliquot level, in case there are multiple aliquots
    -- for a single sample
  SELECT
    sample_barcode,
    aliquot_barcode,
    mirna_id,
    reads_per_million_miRNA_mapped AS RPM
  FROM

## GO_scoring_v0.sql
WITH
  --
  -- We start by translating the correlations that we got into ranks,
  -- sorting the genes on corrByGene in descending ("DESC") order, so
  -- that the highest positive correlation gets rank #1, and so on.
  -- We also lightly filter the genes by excluding any with near-zero
  -- or negative correlation coefficients; the result is a list of
  -- approximately 9000 genes, each with its symbol, correlation,
  -- and rank.
  geneScoresT AS (

## bcgsc_gdc_delta_isoforms.sql
WITH
  aList AS (
  -- aList: the distinct aliquot barcodes present in the bcgsc hg38
  -- isoforms table, exposed as a single column named abarcode.
  SELECT
    aliquot_barcode AS abarcode
  FROM
    `isb-cgc-04-0010.draft_new_data.bcgsc_hg38_isoforms`
  -- no aggregate is selected, so grouping on the alias serves only to
  -- collapse repeated barcodes into one row each
  GROUP BY
    abarcode ),
  gdcData AS (
  SELECT
	# paste this first part into the BigQuery "Query Editor" window

	SELECT
	gene,
	chr,
	CORR(avgCNsegMean,avglogExp) AS corr,
	COUNT(*) AS n
	FROM (
	SELECT
	annotCN.gene AS gene,
	/*

	Copyright 2016, Institute for Systems Biology

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0
	CREATE TEMPORARY FUNCTION

	-- In this function, we're going to be working on arrays of values.
	-- we're also going to define a set of functions 'inside' the kMeans.

	-- heavily borrowing from https://github.com/NathanEpstein/clusters --

	kMeans(x ARRAY<FLOAT64>, -- ESR1 gene expression
	y ARRAY<FLOAT64>, -- EGFR gene expression
	iterations FLOAT64, -- the number of iterations
	WITH
	hg38_d1 AS (
	-- we start with a table at the aliquot level, in case there are multiple aliquots
	-- for a single sample; the SUM() is to sum the isoforms since we're working
	-- with the Isoform_Expression tables
	SELECT
	sample_barcode,
	aliquot_barcode,
	mirna_id,
	mirna_accession,
	WITH
	-- first we get the 77 samples that passed the QC tests
	qcSet AS (
	SELECT
	TCGA_case_ID AS case_barcode
	FROM
	`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
	WHERE
	QC_Status="pass" ),
	--
	WITH
	--
	-- we start by translating the correlations that we got to ranks,
	-- based on sorting the genes on corrByGene "DESC"
	-- this will result in the highest positive correlation getting
	-- rank #1, etc
	-- we also lightly filter the genes by excluding any with near-zero
	-- or negative correlation coefficients, and the result is a list
	-- of approx 9000 genes with symbol, correlation, and rank
	geneScoresT AS (
	WITH
	aList AS (
	SELECT
	aliquot_barcode AS abarcode
	FROM
	`isb-cgc-04-0010.draft_new_data.bcgsc_hg38_isoforms`
	GROUP BY
	abarcode ),
	gdcData AS (
	SELECT