James Weakley jamesweakley

## which_cluster_in_update.sql
update "AUSTRALIAN_WEATHER"
set "CLUSTER_INDEX" = WHICH_CLUSTER("LAT","LON",
  PARSE_JSON('{"0":{"x":-41.89293179866893,"y":146.8715104775374},
               "1":{"x":-33.45503329171531,"y":151.1819215192531},
               "2":{"x":-30.44866734273322,"y":117.7135704718005},
               "3":{"x":-27.26701719379047,"y":150.1829381794606},
               "4":{"x":-36.60725172494781,"y":143.3490676328288}}'));

## new_centroids.sql
  with x as (
  select cluster_index,object_construct('x',avg("LAT")::string,'y',avg("LON")::string) as KMEANS_CLUSTERS
  from "AUSTRALIAN_WEATHER"
  group by cluster_index
  )
  select object_agg(cluster_index,KMEANS_CLUSTERS) as NEW_CLUSTERS from x

## which_cluster.sql
-- The purpose of WHICH_CLUSTER is to assign each input record to the closest cluster out of a given list of clusters.
-- This forms part of the k-means clustering algorithm.
-- For each pair of values, the Euclidian Distance formula is used to determine the closest cluster,
-- and the index of that cluster is returned.
create or replace function WHICH_CLUSTER(X float, Y float, CLUSTER_CENTROIDS variant)
    returns float
    language javascript
    AS '
    function euclidianDistance(x1,x2,y1,y2){
      return Math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2);

## kmeans_call_weather.sql
call k_means('AUSTRALIAN_WEATHER', 'LAT,LON','CLUSTER_INDEX', 5,10)

## add_kmeans_cluster_index_column.sql
alter table AUSTRALIAN_WEATHER
add column CLUSTER_INDEX integer

## kmeans_update_clusters.sql
-- The purpose of UPDATE_CLUSTERS is to assign each input record to the closest cluster out of a list of clusters,
-- and gather information required to recalculate new centroids. This forms part of the k-means clustering algorithm
-- As a User Defined Table Function, it does this by traversing a table, and for each record:
-- 1) Determine which of the clusters in CLUSTER_CENTROIDS it belongs to (closest in 2D distance)
-- 2) Update a running total of values for that cluster, so that the new centroids can be calculated later
-- The reason we output totals in this manner rather than just new centroids, is so that the processing can be parallelized.
-- So UPDATE_CLUSTERS acts like the Map side of a MapReduce.

-- A worked example of a couple of iterations, where the rows looks like this:
--+------+------+----------------------------------------------------------+

## kmeans_clusters_table.sql
-- Initial setup
create table KMEANS_CLUSTERS (
TABLE_NAME varchar,
COLUMNS varchar,
CENTROIDS variant
);

## weather_sample.sql
create or replace table AUSTRALIAN_WEATHER as (
with t as (
  select V:city:coord:lat::float as LAT,
         V:city:coord:lon::float as LON
  from SNOWFLAKE_SAMPLE_DATA.WEATHER.HOURLY_16_TOTAL
  where V:city:country::varchar='AU'
  limit 1000000
  )
  select * from t sample (10000 rows)
)

## gini_calc_call.sql
with partioned_result as (
  select gini.*
from "ML_BENCHMARKING"."PUBLIC"."HIGGS_BOSON_BUCKETED" ,
  table(gini_calc(parse_json('{    selectionCriteriaAttribute: "PRI_jet_subleading_pt",        selectionCriteriaPredicate: ">",        selectionCriteriaValue: 1,        children: [            {                selectionCriteriaAttribute: "PRI_jet_leading_pt",                selectionCriteriaPredicate: "<=",                selectionCriteriaValue: 3,                splitCandidates: {                    der_mass_jet_jet: [7,8,9]                }            },            {                selectionCriteriaAttribute: "PRI_jet_leading_pt",                selectionCriteriaPredicate: ">",                selectionCriteriaValue: 3,                splitCandidates: {                    der_mass_jet_jet: [7,8,9]                }            }        ]    }'),
                object_construct('der_mass_jet_jet',der_mass_jet_jet,'PRI_jet_leading_pt',PRI_jet_leading_pt, 'PRI_jet_subleading_pt',PRI_jet_subleading_pt,'label',label

## gini_calc_udtf.sql
-- This UDTF will return all information required for a gini-based branch split decision.
-- The input object describes a tree in the process of being built.
-- Each node contains a subset filter to qualify/disqualify each row, and either
-- 1) a list of children (keep drilling)
-- 2) a map of split candidates (do the gini calcs)
-- 3) nothing else (tree terminated here previously)
-- Example:
-- {
--   rootBranch: {
--     subsetFilter: {
	update "AUSTRALIAN_WEATHER"
	set "CLUSTER_INDEX" = WHICH_CLUSTER("LAT","LON",
	PARSE_JSON('{"0":{"x":-41.89293179866893,"y":146.8715104775374},
	"1":{"x":-33.45503329171531,"y":151.1819215192531},
	"2":{"x":-30.44866734273322,"y":117.7135704718005},
	"3":{"x":-27.26701719379047,"y":150.1829381794606},
	"4":{"x":-36.60725172494781,"y":143.3490676328288}}'));
	with x as (
	select cluster_index,object_construct('x',avg("LAT")::string,'y',avg("LON")::string) as KMEANS_CLUSTERS
	from "AUSTRALIAN_WEATHER"
	group by cluster_index
	)
	select object_agg(cluster_index,KMEANS_CLUSTERS) as NEW_CLUSTERS from x
	-- The purpose of WHICH_CLUSTER is to assign each input record to the closest cluster out of a given list of clusters.
	-- This forms part of the k-means clustering algorithm.
	-- For each pair of values, the Euclidian Distance formula is used to determine the closest cluster,
	-- and the index of that cluster is returned.
	create or replace function WHICH_CLUSTER(X float, Y float, CLUSTER_CENTROIDS variant)
	returns float
	language javascript
	AS '
	function euclidianDistance(x1,x2,y1,y2){
	return Math.sqrt((x1 - x2) 2 + (y1 - y2) 2);
	alter table AUSTRALIAN_WEATHER
	add column CLUSTER_INDEX integer
	-- The purpose of UPDATE_CLUSTERS is to assign each input record to the closest cluster out of a list of clusters,
	-- and gather information required to recalculate new centroids. This forms part of the k-means clustering algorithm
	-- As a User Defined Table Function, it does this by traversing a table, and for each record:
	-- 1) Determine which of the clusters in CLUSTER_CENTROIDS it belongs to (closest in 2D distance)
	-- 2) Update a running total of values for that cluster, so that the new centroids can be calculated later
	-- The reason we output totals in this manner rather than just new centroids, is so that the processing can be parallelized.
	-- So UPDATE_CLUSTERS acts like the Map side of a MapReduce.

	-- A worked example of a couple of iterations, where the rows looks like this:
	--+------+------+----------------------------------------------------------+
	-- Initial setup
	create table KMEANS_CLUSTERS (
	TABLE_NAME varchar,
	COLUMNS varchar,
	CENTROIDS variant
	);
	create or replace table AUSTRALIAN_WEATHER as (
	with t as (
	select V:city:coord:lat::float as LAT,
	V:city:coord:lon::float as LON
	from SNOWFLAKE_SAMPLE_DATA.WEATHER.HOURLY_16_TOTAL
	where V:city:country::varchar='AU'
	limit 1000000
	)
	select * from t sample (10000 rows)
	)
	with partioned_result as (
	select gini.*
	from "ML_BENCHMARKING"."PUBLIC"."HIGGS_BOSON_BUCKETED" ,
	table(gini_calc(parse_json('{ selectionCriteriaAttribute: "PRI_jet_subleading_pt", selectionCriteriaPredicate: ">", selectionCriteriaValue: 1, children: [ { selectionCriteriaAttribute: "PRI_jet_leading_pt", selectionCriteriaPredicate: "<=", selectionCriteriaValue: 3, splitCandidates: { der_mass_jet_jet: [7,8,9] } }, { selectionCriteriaAttribute: "PRI_jet_leading_pt", selectionCriteriaPredicate: ">", selectionCriteriaValue: 3, splitCandidates: { der_mass_jet_jet: [7,8,9] } } ] }'),
	object_construct('der_mass_jet_jet',der_mass_jet_jet,'PRI_jet_leading_pt',PRI_jet_leading_pt, 'PRI_jet_subleading_pt',PRI_jet_subleading_pt,'label',label
	-- This UDTF will return all information required for a gini-based branch split decision.
	-- The input object describes a tree in the process of being built.
	-- Each node contains a subset filter to qualify/disqualify each row, and either
	-- 1) a list of children (keep drilling)
	-- 2) a map of split candidates (do the gini calcs)
	-- 3) nothing else (tree terminated here previously)
	-- Example:
	-- {
	-- rootBranch: {
	-- subsetFilter: {