Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Gibbsdavidl/72941f4f58c740d603c266f04939dadd to your computer and use it in GitHub Desktop.
Save Gibbsdavidl/72941f4f58c740d603c266f04939dadd to your computer and use it in GitHub Desktop.
BigQuery_Linear_Regression_Library.sql
CREATE TEMP FUNCTION
bq_linear(x ARRAY<FLOAT64>, y ARRAY<FLOAT64>)
RETURNS FLOAT64
LANGUAGE js AS """
// http://tom-alexander.github.io/regression-js/
var data = [];
// Our data list is list of lists. The small list being each (x,y) point
for (var i = 0; i < x.length; i++) {
data.push([x[i],y[i]])
}
var result = regression('linear', data) ;
var slope = result.equation[0];
return(slope)
""" OPTIONS ( library=["gs://gibbs_bucket_nov162016/regression.js"] ); WITH
--
-- First we get gene expression
--
geneExpr_X AS (
SELECT
sample_barcode,
LOG10(normalized_count+1) AS X
FROM
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM`
WHERE
project_short_name = 'TCGA-BRCA'
AND HGNC_gene_symbol = 'EGFR'
GROUP BY
sample_barcode,
normalized_count),
--
-- Then a miRNA expression sub-table
--
miExpr_Y AS (
SELECT
sample_barcode,
LOG10(reads_per_million_miRNA_mapped+1) AS Y
FROM
`isb-cgc.TCGA_hg19_data_v0.miRNAseq_Expression`
WHERE
project_short_name = 'TCGA-BRCA'
AND mirna_id = 'hsa-mir-7-1'
GROUP BY
sample_barcode,
reads_per_million_miRNA_mapped
),
--
-- Here we join the two previous tables
--
dat AS (
select
ARRAY_AGG(a.X) as ArrX,
ARRAY_AGG(b.Y) AS ArrY
from
geneExpr_X as a join miExpr_Y as b on a.sample_barcode = b.sample_barcode
)
--
-- Now we call our UDF
--
select bq_linear(ArrX, ArrY) from dat
@Gibbsdavidl
Copy link
Author

... Return an array of values ...

CREATE TEMP FUNCTION
bq_linear(x ARRAY, y ARRAY)
RETURNS ARRAY
LANGUAGE js AS """
// http://tom-alexander.github.io/regression-js/

var data = [];
// Our data list is list of lists. The small list being each (x,y) point
for (var i = 0; i < x.length; i++) {
  data.push([x[i],y[i]])
}

var result = regression('linear', data) ;
var slope = result.equation[0];
var intercept = result.equation[1];
return( [slope, intercept] )

"""

@Gibbsdavidl
Copy link
Author

CREATE TEMP FUNCTION
bq_linear(x ARRAY,
y ARRAY)
RETURNS FLOAT64
LANGUAGE js AS """
// http://tom-alexander.github.io/regression-js/

var data = [];
// Our data list is list of lists. The small list being each (x,y) point
for (var i = 0; i < x.length; i++) {
  data.push([x[i],y[i]])
}

var result = regression('linear', data) ;
var slope = result.equation[0];
return(slope)

""" OPTIONS ( library=["gs://gibbs_bucket_nov162016/regression.js"] ); WITH

-- First we get gene expression

geneExpr_X AS (
SELECT
project_short_name,
sample_barcode,
LOG10(normalized_count+1) AS X
FROM
isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM
WHERE
HGNC_gene_symbol = 'EGFR'
GROUP BY
project_short_name,
sample_barcode,
normalized_count),

-- Then a miRNA expression sub-table

miExpr_Y AS (
SELECT
project_short_name,
sample_barcode,
LOG10(reads_per_million_miRNA_mapped+1) AS Y
FROM
isb-cgc.TCGA_hg19_data_v0.miRNAseq_Expression
WHERE
mirna_id = 'hsa-mir-7-1'
GROUP BY
project_short_name,
sample_barcode,
reads_per_million_miRNA_mapped ),

-- Here we join the two previous tables

dat AS (
SELECT
a.project_short_name,
ARRAY_AGG(a.X) AS ArrX,
ARRAY_AGG(b.Y) AS ArrY
FROM
geneExpr_X AS a
JOIN
miExpr_Y AS b
ON
a.sample_barcode = b.sample_barcode
AND a.project_short_name = b.project_short_name
GROUP BY
project_short_name)

-- Now we call our UDF

SELECT
project_short_name,
bq_linear(ArrX, ArrY) as slope
FROM
dat
ORDER BY
slope

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment