Skip to content

Instantly share code, notes, and snippets.

@smrgit
smrgit / cn-exp-corr-BRCA.bq
Last active November 18, 2016 19:07
Sample BigQuery SQL that correlates copy-number and gene-expression for TCGA samples in the BRCA cohort. There are two parts to this: the SQL and the UDF.
# paste this first part into the BigQuery "Query Editor" window
SELECT
gene,
chr,
CORR(avgCNsegMean,avglogExp) AS corr,
COUNT(*) AS n
FROM (
SELECT
annotCN.gene AS gene,
@smrgit
smrgit / gdc_toil_gexp.sql
Last active December 7, 2016 23:48
BigQuery standard SQL query for correlating GDC/HTSeq gene expression values with Toil/RSEM gene expression values.
/*
Copyright 2016, Institute for Systems Biology
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
@smrgit
smrgit / kMeans_in_BQ.sql
Last active October 23, 2021 14:43
kMeans using JavaScript UDFs in BigQuery
CREATE TEMPORARY FUNCTION
-- In this function, we're going to be working on arrays of values.
-- we're also going to define a set of functions 'inside' the kMeans.
-- *heavily borrowing from https://github.com/NathanEpstein/clusters* --
kMeans(x ARRAY<FLOAT64>, -- ESR1 gene expression
y ARRAY<FLOAT64>, -- EGFR gene expression
iterations FLOAT64, -- the number of iterations
@smrgit
smrgit / miRNA_hg19_hg38_isoform_corr.sql
Last active April 14, 2017 17:35
BQ GDC hg19 vs hg38 miRNA correlation analysis -- at the isoform expression level
WITH
hg38_d1 AS (
-- we start with a table at the aliquot level, in case there are multiple aliquots
-- for a single sample; the SUM() is to sum the isoforms since we're working
-- with the Isoform_Expression tables
SELECT
sample_barcode,
aliquot_barcode,
mirna_id,
mirna_accession,
@smrgit
smrgit / MIMAT0000082_hg19_vs_hg38
Created March 2, 2017 16:06
extract data for MIMAT0000082 from hg19 and hg38 tables
WITH
hg38_d1 AS (
-- we start with a table at the aliquot level, in case there are multiple aliquots
-- for a single sample; the SUM() is to sum the isoforms since we're working
-- with the Isoform_Expression tables
SELECT
sample_barcode,
aliquot_barcode,
mirna_id,
mirna_accession,
@smrgit
smrgit / BRCA_CPTAC_RNAseq_corr.sql
Last active March 22, 2017 16:38
BRCA CPTAC-RNAseq correlation
WITH
-- first we get the 77 samples that passed the QC tests
qcSet AS (
SELECT
TCGA_case_ID AS case_barcode
FROM
`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
WHERE
QC_Status="pass" ),
--
@smrgit
smrgit / BRCA_CPTAC_RPPA_corr.sql
Last active March 22, 2017 15:42
BRCA CPTAC-RPPA correlation
WITH
-- first we get the 77 samples that passed the QC tests
qcSet AS (
SELECT
TCGA_case_ID AS case_barcode
FROM
`isb-cgc.hg19_data_previews.TCGA_Breast_SuppTable01`
WHERE
QC_Status="pass" ),
--
@smrgit
smrgit / miRNA_hg19_hg38_stemloop_corr.sql
Last active April 14, 2017 17:44
GQ GDC hg19 vs hg38 miRNA correlation analysis -- at the stemloop expression level
WITH
hg38_d1 AS (
-- we start with a table at the aliquot level, in case there are multiple aliquots
-- for a single sample;
SELECT
sample_barcode,
aliquot_barcode,
mirna_id,
reads_per_million_miRNA_mapped AS RPM
FROM
@smrgit
smrgit / GO_scoring_v0.sql
Created March 22, 2017 23:51
GO_scoring_v0
WITH
--
-- we start by translating the correlations that we got to ranks,
-- based on sorting the genes on corrByGene "DESC"
-- this will result in the highest positive correlation getting
-- rank #1, etc
-- we also lightly filter the genes by excluding any with near-zero
-- or negative correlation coefficients, and the result is a list
-- of approx 9000 genes with symbol, correlation, and rank
geneScoresT AS (
@smrgit
smrgit / bcgsc_gdc_delta_isoforms.sql
Created March 24, 2017 03:46
join BCGSC and GDC hg38 miRNA isoform-level data and compute read_count deltas
WITH
aList AS (
SELECT
aliquot_barcode AS abarcode
FROM
`isb-cgc-04-0010.draft_new_data.bcgsc_hg38_isoforms`
GROUP BY
abarcode ),
gdcData AS (
SELECT