Created
May 23, 2017 23:23
-
-
Save Gibbsdavidl/72941f4f58c740d603c266f04939dadd to your computer and use it in GitHub Desktop.
BigQuery_Linear_Regression_Library.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TEMP FUNCTION | |
bq_linear(x ARRAY<FLOAT64>, y ARRAY<FLOAT64>) | |
RETURNS FLOAT64 | |
LANGUAGE js AS """ | |
// http://tom-alexander.github.io/regression-js/ | |
var data = []; | |
// Our data list is list of lists. The small list being each (x,y) point | |
for (var i = 0; i < x.length; i++) { | |
data.push([x[i],y[i]]) | |
} | |
var result = regression('linear', data) ; | |
var slope = result.equation[0]; | |
return(slope) | |
""" OPTIONS ( library=["gs://gibbs_bucket_nov162016/regression.js"] ); WITH | |
-- | |
-- First we get gene expression | |
-- | |
geneExpr_X AS ( | |
SELECT | |
sample_barcode, | |
LOG10(normalized_count+1) AS X | |
FROM | |
`isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM` | |
WHERE | |
project_short_name = 'TCGA-BRCA' | |
AND HGNC_gene_symbol = 'EGFR' | |
GROUP BY | |
sample_barcode, | |
normalized_count), | |
-- | |
-- Then a miRNA expression sub-table | |
-- | |
miExpr_Y AS ( | |
SELECT | |
sample_barcode, | |
LOG10(reads_per_million_miRNA_mapped+1) AS Y | |
FROM | |
`isb-cgc.TCGA_hg19_data_v0.miRNAseq_Expression` | |
WHERE | |
project_short_name = 'TCGA-BRCA' | |
AND mirna_id = 'hsa-mir-7-1' | |
GROUP BY | |
sample_barcode, | |
reads_per_million_miRNA_mapped | |
), | |
-- | |
-- Here we join the two previous tables | |
-- | |
dat AS ( | |
select | |
ARRAY_AGG(a.X) as ArrX, | |
ARRAY_AGG(b.Y) AS ArrY | |
from | |
geneExpr_X as a join miExpr_Y as b on a.sample_barcode = b.sample_barcode | |
) | |
-- | |
-- Now we call our UDF | |
-- | |
select bq_linear(ArrX, ArrY) from dat |
CREATE TEMP FUNCTION
bq_linear(x ARRAY,
y ARRAY)
RETURNS FLOAT64
LANGUAGE js AS """
// http://tom-alexander.github.io/regression-js/
var data = [];
// Our data list is list of lists. The small list being each (x,y) point
for (var i = 0; i < x.length; i++) {
data.push([x[i],y[i]])
}
var result = regression('linear', data) ;
var slope = result.equation[0];
return(slope)
""" OPTIONS ( library=["gs://gibbs_bucket_nov162016/regression.js"] ); WITH
-- First we get gene expression
geneExpr_X AS (
SELECT
project_short_name,
sample_barcode,
LOG10(normalized_count+1) AS X
FROM
isb-cgc.TCGA_hg19_data_v0.RNAseq_Gene_Expression_UNC_RSEM
WHERE
HGNC_gene_symbol = 'EGFR'
GROUP BY
project_short_name,
sample_barcode,
normalized_count),
-- Then a miRNA expression sub-table
miExpr_Y AS (
SELECT
project_short_name,
sample_barcode,
LOG10(reads_per_million_miRNA_mapped+1) AS Y
FROM
isb-cgc.TCGA_hg19_data_v0.miRNAseq_Expression
WHERE
mirna_id = 'hsa-mir-7-1'
GROUP BY
project_short_name,
sample_barcode,
reads_per_million_miRNA_mapped ),
-- Here we join the two previous tables
dat AS (
SELECT
a.project_short_name,
ARRAY_AGG(a.X) AS ArrX,
ARRAY_AGG(b.Y) AS ArrY
FROM
geneExpr_X AS a
JOIN
miExpr_Y AS b
ON
a.sample_barcode = b.sample_barcode
AND a.project_short_name = b.project_short_name
GROUP BY
project_short_name)
-- Now we call our UDF
SELECT
project_short_name,
bq_linear(ArrX, ArrY) as slope
FROM
dat
ORDER BY
slope
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
... Return an array of values ...
CREATE TEMP FUNCTION
bq_linear(x ARRAY, y ARRAY)
RETURNS ARRAY
LANGUAGE js AS """
// http://tom-alexander.github.io/regression-js/
"""