smrgit/kMeans_in_BQ.sql

## kMeans_in_BQ.sql
  CREATE TEMPORARY FUNCTION

    -- In this function, we're going to be working on arrays of values.
    -- we're also going to define a set of functions 'inside' the kMeans.

    -- *heavily borrowing from https://github.com/NathanEpstein/clusters* --

    kMeans(x ARRAY<FLOAT64>,  -- ESR1 gene expression
           y ARRAY<FLOAT64>,  -- EGFR gene expression
           iterations FLOAT64,  -- the number of iterations
           k FLOAT64)           -- the number of clusters

    RETURNS ARRAY<FLOAT64>  -- returns the cluster assignments

    LANGUAGE js AS """
    'use strict'


    function sumOfSquareDiffs(oneVector, anotherVector) {
      // the sum of squares error //
      var squareDiffs = oneVector.map(function(component, i) {
        return Math.pow(component - anotherVector[i], 2);
      });
      return squareDiffs.reduce(function(a, b) { return a + b }, 0);
    };

  function mindex(array) {
    // returns the index to the minimum value in the array
    var min = array.reduce(function(a, b) {
      return Math.min(a, b);
    });
    return array.indexOf(min);
  };

  function sumVectors(a, b) {
    // The map function gets used frequently in JavaScript
    return a.map(function(val, i) { return val + b[i] });
  };

  function averageLocation(points) {
    // Take all the points assigned to a cluster
    // and find the averge center point.
    // This gets used to update the cluster centroids.
    var zeroVector = points[0].location.map(function() { return 0 });
    var locations = points.map(function(point) { return point.location });
    var vectorSum = locations.reduce(function(a, b) { return sumVectors(a, b) }, zeroVector);
    return vectorSum.map(function(val) { return val / points.length });
  };

  function Point(location) {
    // A point object, each sample is represented as a point //
    var self = this;
    this.location = location;
    this.label = 1;
    this.updateLabel = function(centroids) {
      var distancesSquared = centroids.map(function(centroid) {
        return sumOfSquareDiffs(self.location, centroid.location);
      });
      self.label = mindex(distancesSquared);
    };
  };


  function Centroid(initialLocation, label) {
    // The cluster centroids //
    var self = this;
    this.location = initialLocation;
    this.label = label;
    this.updateLocation = function(points) {
      var pointsWithThisCentroid = points.filter(function(point) { return point.label == self.label });
      if (pointsWithThisCentroid.length > 0) {
        self.location = averageLocation(pointsWithThisCentroid);
      }
    };
  };


  var data = [];

  // Our data list is list of lists. The small list being each (x,y) point
  for (var i = 0; i < x.length; i++) {
    data.push([x[i],y[i]])
  }

  // initialize point objects with data
  var points = data.map(function(vector) { return new Point(vector) });


  // intialize centroids
  var centroids = [];
  for (var i = 0; i < k; i++) {
    centroids.push(new Centroid(points[i % points.length].location, i));
  };


  // update labels and centroid locations until convergence
  for (var iter = 0; iter < iterations; iter++) {
    points.forEach(function(point) { point.updateLabel(centroids) });
    centroids.forEach(function(centroid) { centroid.updateLocation(points) });
  };

  // return the cluster labels.
  var labels = []
  for (var i = 0; i < points.length; i++) {
    labels.push(points[i].label)
  }

  return labels;

  """;
  --
  -- *** In this query, we create two subtables, one for each gene of
  --     interest, then create a set of arrays in joining the two tables.
  --     We call the UDF using the arrays, and get a result back
  --     made of arrays.
  --
  --     Due to a technical issue we save the table of arrays to
  --     to a personal dataset, then unpack it. ***
  --
  WITH
    -- gene1, the first subtable
    --
    gene1 AS (
    SELECT
      ROW_NUMBER() OVER() row_number,
      AliquotBarcode AS barcode1,
      HGNC_gene_symbol AS gene_id1,
      AVG(LOG(normalized_count+1, 2)) AS count1
    FROM
      `isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM`
    WHERE
      Study = 'BRCA'
      AND SampleTypeLetterCode = 'TP'
      AND HGNC_gene_symbol = 'ESR1'
      AND normalized_count >= 0
    GROUP BY
      AliquotBarcode,
      gene_id1),
    --
    -- gene2, the second subtable
    --
    gene2 AS (
    SELECT
      AliquotBarcode AS barcode2,
      HGNC_gene_symbol AS gene_id2,
      AVG(LOG(normalized_count+1, 2)) AS count2
    FROM
      `isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM`
    WHERE
      Study = 'BRCA'
      AND SampleTypeLetterCode = 'TP'
      AND HGNC_gene_symbol = 'EGFR'
      AND normalized_count >= 0
    GROUP BY
      AliquotBarcode,
      HGNC_gene_symbol),
    --
    -- Then we create a table of arrays
    -- and join the two gene tables.
    -- ** We need to make sure all the arrays are constructed using the same index. **
    --
    arrayTable AS (
    SELECT
      ARRAY_AGG(m1.row_number ORDER BY m1.barcode1) AS arrayn,
      ARRAY_AGG(m1.barcode1 ORDER BY m1.barcode1) AS barcode,
      ARRAY_AGG(count1 ORDER BY m1.barcode1) AS esr1,
      ARRAY_AGG(count2 ORDER BY m1.barcode1) AS egfr
    FROM
      gene1 AS m1
    JOIN
      gene2 AS m2
    ON
      m1.barcode1 = m2.barcode2 )
    --
    -- Now we call the k-means UDF.
    --
  SELECT
    arrayn,
    barcode,
    esr1,
    egfr,
    kMeans(esr1, egfr, 200.0, 2.0) AS cluster
  FROM
    arrayTable
    --
    -- save the resulting table to a personal dataset
	CREATE TEMPORARY FUNCTION

	-- In this function, we're going to be working on arrays of values.
	-- we're also going to define a set of functions 'inside' the kMeans.

	-- heavily borrowing from https://github.com/NathanEpstein/clusters --

	kMeans(x ARRAY<FLOAT64>, -- ESR1 gene expression
	y ARRAY<FLOAT64>, -- EGFR gene expression
	iterations FLOAT64, -- the number of iterations
	k FLOAT64) -- the number of clusters

	RETURNS ARRAY<FLOAT64> -- returns the cluster assignments

	LANGUAGE js AS """
	'use strict'


	function sumOfSquareDiffs(oneVector, anotherVector) {
	// the sum of squares error //
	var squareDiffs = oneVector.map(function(component, i) {
	return Math.pow(component - anotherVector[i], 2);
	});
	return squareDiffs.reduce(function(a, b) { return a + b }, 0);
	};

	function mindex(array) {
	// returns the index to the minimum value in the array
	var min = array.reduce(function(a, b) {
	return Math.min(a, b);
	});
	return array.indexOf(min);
	};

	function sumVectors(a, b) {
	// The map function gets used frequently in JavaScript
	return a.map(function(val, i) { return val + b[i] });
	};

	function averageLocation(points) {
	// Take all the points assigned to a cluster
	// and find the averge center point.
	// This gets used to update the cluster centroids.
	var zeroVector = points[0].location.map(function() { return 0 });
	var locations = points.map(function(point) { return point.location });
	var vectorSum = locations.reduce(function(a, b) { return sumVectors(a, b) }, zeroVector);
	return vectorSum.map(function(val) { return val / points.length });
	};

	function Point(location) {
	// A point object, each sample is represented as a point //
	var self = this;
	this.location = location;
	this.label = 1;
	this.updateLabel = function(centroids) {
	var distancesSquared = centroids.map(function(centroid) {
	return sumOfSquareDiffs(self.location, centroid.location);
	});
	self.label = mindex(distancesSquared);
	};
	};


	function Centroid(initialLocation, label) {
	// The cluster centroids //
	var self = this;
	this.location = initialLocation;
	this.label = label;
	this.updateLocation = function(points) {
	var pointsWithThisCentroid = points.filter(function(point) { return point.label == self.label });
	if (pointsWithThisCentroid.length > 0) {
	self.location = averageLocation(pointsWithThisCentroid);
	}
	};
	};


	var data = [];

	// Our data list is list of lists. The small list being each (x,y) point
	for (var i = 0; i < x.length; i++) {
	data.push([x[i],y[i]])
	}

	// initialize point objects with data
	var points = data.map(function(vector) { return new Point(vector) });


	// intialize centroids
	var centroids = [];
	for (var i = 0; i < k; i++) {
	centroids.push(new Centroid(points[i % points.length].location, i));
	};


	// update labels and centroid locations until convergence
	for (var iter = 0; iter < iterations; iter++) {
	points.forEach(function(point) { point.updateLabel(centroids) });
	centroids.forEach(function(centroid) { centroid.updateLocation(points) });
	};

	// return the cluster labels.
	var labels = []
	for (var i = 0; i < points.length; i++) {
	labels.push(points[i].label)
	}

	return labels;

	""";
	--
	-- *** In this query, we create two subtables, one for each gene of
	-- interest, then create a set of arrays in joining the two tables.
	-- We call the UDF using the arrays, and get a result back
	-- made of arrays.
	--
	-- Due to a technical issue we save the table of arrays to
	-- to a personal dataset, then unpack it. ***
	--
	WITH
	-- gene1, the first subtable
	--
	gene1 AS (
	SELECT
	ROW_NUMBER() OVER() row_number,
	AliquotBarcode AS barcode1,
	HGNC_gene_symbol AS gene_id1,
	AVG(LOG(normalized_count+1, 2)) AS count1
	FROM
	`isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM`
	WHERE
	Study = 'BRCA'
	AND SampleTypeLetterCode = 'TP'
	AND HGNC_gene_symbol = 'ESR1'
	AND normalized_count >= 0
	GROUP BY
	AliquotBarcode,
	gene_id1),
	--
	-- gene2, the second subtable
	--
	gene2 AS (
	SELECT
	AliquotBarcode AS barcode2,
	HGNC_gene_symbol AS gene_id2,
	AVG(LOG(normalized_count+1, 2)) AS count2
	FROM
	`isb-cgc.tcga_201607_beta.mRNA_UNC_RSEM`
	WHERE
	Study = 'BRCA'
	AND SampleTypeLetterCode = 'TP'
	AND HGNC_gene_symbol = 'EGFR'
	AND normalized_count >= 0
	GROUP BY
	AliquotBarcode,
	HGNC_gene_symbol),
	--
	-- Then we create a table of arrays
	-- and join the two gene tables.
	-- We need to make sure all the arrays are constructed using the same index.
	--
	arrayTable AS (
	SELECT
	ARRAY_AGG(m1.row_number ORDER BY m1.barcode1) AS arrayn,
	ARRAY_AGG(m1.barcode1 ORDER BY m1.barcode1) AS barcode,
	ARRAY_AGG(count1 ORDER BY m1.barcode1) AS esr1,
	ARRAY_AGG(count2 ORDER BY m1.barcode1) AS egfr
	FROM
	gene1 AS m1
	JOIN
	gene2 AS m2
	ON
	m1.barcode1 = m2.barcode2 )
	--
	-- Now we call the k-means UDF.
	--
	SELECT
	arrayn,
	barcode,
	esr1,
	egfr,
	kMeans(esr1, egfr, 200.0, 2.0) AS cluster
	FROM
	arrayTable
	--
	-- save the resulting table to a personal dataset