Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
kMeans using JavaScript UDFs in BigQuery
CREATE TEMPORARY FUNCTION
-- In this function, we're going to be working on arrays of values.
-- we're also going to define a set of functions 'inside' the kMeans.
-- *heavily borrowing from https://github.com/NathanEpstein/clusters* --
kMeans(x ARRAY<FLOAT64>, -- ESR1 gene expression
y ARRAY<FLOAT64>, -- EGFR gene expression
iterations FLOAT64, -- the number of iterations
k FLOAT64) -- the number of clusters
RETURNS ARRAY<FLOAT64> -- returns the cluster assignments
LANGUAGE js AS """
'use strict'
function sumOfSquareDiffs(oneVector, anotherVector) {
// the sum of squares error //
var squareDiffs = oneVector.map(function(component, i) {
return Math.pow(component - anotherVector[i], 2);
});
return squareDiffs.reduce(function(a, b) { return a + b }, 0);
};
function mindex(array) {
// returns the index to the minimum value in the array
var min = array.reduce(function(a, b) {
return Math.min(a, b);
});
return array.indexOf(min);
};
function sumVectors(a, b) {
// The map function gets used frequently in JavaScript
return a.map(function(val, i) { return val + b[i] });
};
function averageLocation(points) {
// Take all the points assigned to a cluster
// and find the averge center point.
// This gets used to update the cluster centroids.
var zeroVector = points[0].location.map(function() { return 0 });
var locations = points.map(function(point) { return point.location });
var vectorSum = locations.reduce(function(a, b) { return sumVectors(a, b) }, zeroVector);
return vectorSum.map(function(val) { return val / points.length });
};
function Point(location) {
// A point object, each sample is represented as a point //
var self = this;
this.location = location;
this.label = 1;
this.updateLabel = function(centroids) {
var distancesSquared = centroids.map(function(centroid) {
return sumOfSquareDiffs(self.location, centroid.location);
});
self.label = mindex(distancesSquared);
};
};
function Centroid(initialLocation, label) {
// The cluster centroids //
var self = this;
this.location = initialLocation;
this.label = label;
this.updateLocation = function(points) {
var pointsWithThisCentroid = points.filter(function(point) { return point.label == self.label });
if (pointsWithThisCentroid.length > 0) {
self.location = averageLocation(pointsWithThisCentroid);
}
};
};
var data = [];
// Our data list is list of lists. The small list being each (x,y) point
for (var i = 0; i < x.length; i++) {
data.push([x[i],y[i]])
}
// initialize point objects with data
var points = data.map(function(vector) { return new Point(vector) });
// intialize centroids
var centroids = [];
for (var i = 0; i < k; i++) {
centroids.push(new Centroid(points[i % points.length].location, i));
};
// update labels and centroid locations until convergence
for (var iter = 0; iter < iterations; iter++) {
points.forEach(function(point) { point.updateLabel(centroids) });
centroids.forEach(function(centroid) { centroid.updateLocation(points) });
};
// return the cluster labels.
var labels = []
for (var i = 0; i < points.length; i++) {
labels.push(points[i].label)
}
return labels;
""";
--
-- *** In this query, we create two subtables, one for each gene of
-- interest, then create a set of arrays in joining the two tables.
-- We call the UDF using the arrays, and get a result back
-- made of arrays.
--
-- Due to a technical issue we save the table of arrays to
-- to a personal dataset, then unpack it. ***
--
WITH
-- gene1, the first subtable
--
gene1 AS (
SELECT
ROW_NUMBER() OVER() row_number,
AliquotBarcode AS barcode1,
HGNC_gene_symbol AS gene_id1,
AVG(LOG(normalized_count+1, 2)) AS count1
FROM
`isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM`
WHERE
Study = 'BRCA'
AND SampleTypeLetterCode = 'TP'
AND HGNC_gene_symbol = 'ESR1'
AND normalized_count >= 0
GROUP BY
AliquotBarcode,
gene_id1),
--
-- gene2, the second subtable
--
gene2 AS (
SELECT
AliquotBarcode AS barcode2,
HGNC_gene_symbol AS gene_id2,
AVG(LOG(normalized_count+1, 2)) AS count2
FROM
`isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM`
WHERE
Study = 'BRCA'
AND SampleTypeLetterCode = 'TP'
AND HGNC_gene_symbol = 'EGFR'
AND normalized_count >= 0
GROUP BY
AliquotBarcode,
HGNC_gene_symbol),
--
-- Then we create a table of arrays
-- and join the two gene tables.
-- ** We need to make sure all the arrays are constructed using the same index. **
--
arrayTable AS (
SELECT
ARRAY_AGG(m1.row_number ORDER BY m1.barcode1) AS arrayn,
ARRAY_AGG(m1.barcode1 ORDER BY m1.barcode1) AS barcode,
ARRAY_AGG(count1 ORDER BY m1.barcode1) AS esr1,
ARRAY_AGG(count2 ORDER BY m1.barcode1) AS egfr
FROM
gene1 AS m1
JOIN
gene2 AS m2
ON
m1.barcode1 = m2.barcode2 )
--
-- Now we call the k-means UDF.
--
SELECT
arrayn,
barcode,
esr1,
egfr,
kMeans(esr1, egfr, 200.0, 2.0) AS cluster
FROM
arrayTable
--
-- save the resulting table to a personal dataset
@ernestoongaro

This comment has been minimized.

Copy link

commented Jan 8, 2019

Hi @smrgit is there a license attached to this code?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.