Skip to content

Instantly share code, notes, and snippets.

update "AUSTRALIAN_WEATHER"
set "CLUSTER_INDEX" = WHICH_CLUSTER("LAT","LON",
PARSE_JSON('{"0":{"x":-41.89293179866893,"y":146.8715104775374},
"1":{"x":-33.45503329171531,"y":151.1819215192531},
"2":{"x":-30.44866734273322,"y":117.7135704718005},
"3":{"x":-27.26701719379047,"y":150.1829381794606},
"4":{"x":-36.60725172494781,"y":143.3490676328288}}'));
with x as (
select cluster_index,object_construct('x',avg("LAT")::string,'y',avg("LON")::string) as KMEANS_CLUSTERS
from "AUSTRALIAN_WEATHER"
group by cluster_index
)
select object_agg(cluster_index,KMEANS_CLUSTERS) as NEW_CLUSTERS from x
-- The purpose of WHICH_CLUSTER is to assign each input record to the closest cluster out of a given list of clusters.
-- This forms part of the k-means clustering algorithm.
-- For each pair of values, the Euclidian Distance formula is used to determine the closest cluster,
-- and the index of that cluster is returned.
create or replace function WHICH_CLUSTER(X float, Y float, CLUSTER_CENTROIDS variant)
returns float
language javascript
AS '
function euclidianDistance(x1,x2,y1,y2){
return Math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2);
call k_means('AUSTRALIAN_WEATHER', 'LAT,LON','CLUSTER_INDEX', 5,10)
alter table AUSTRALIAN_WEATHER
add column CLUSTER_INDEX integer
-- The purpose of UPDATE_CLUSTERS is to assign each input record to the closest cluster out of a list of clusters,
-- and gather information required to recalculate new centroids. This forms part of the k-means clustering algorithm
-- As a User Defined Table Function, it does this by traversing a table, and for each record:
-- 1) Determine which of the clusters in CLUSTER_CENTROIDS it belongs to (closest in 2D distance)
-- 2) Update a running total of values for that cluster, so that the new centroids can be calculated later
-- The reason we output totals in this manner rather than just new centroids, is so that the processing can be parallelized.
-- So UPDATE_CLUSTERS acts like the Map side of a MapReduce.
-- A worked example of a couple of iterations, where the rows looks like this:
--+------+------+----------------------------------------------------------+
-- Initial setup
create table KMEANS_CLUSTERS (
TABLE_NAME varchar,
COLUMNS varchar,
CENTROIDS variant
);
create or replace table AUSTRALIAN_WEATHER as (
with t as (
select V:city:coord:lat::float as LAT,
V:city:coord:lon::float as LON
from SNOWFLAKE_SAMPLE_DATA.WEATHER.HOURLY_16_TOTAL
where V:city:country::varchar='AU'
limit 1000000
)
select * from t sample (10000 rows)
)
with partioned_result as (
select gini.*
from "ML_BENCHMARKING"."PUBLIC"."HIGGS_BOSON_BUCKETED" ,
table(gini_calc(parse_json('{ selectionCriteriaAttribute: "PRI_jet_subleading_pt", selectionCriteriaPredicate: ">", selectionCriteriaValue: 1, children: [ { selectionCriteriaAttribute: "PRI_jet_leading_pt", selectionCriteriaPredicate: "<=", selectionCriteriaValue: 3, splitCandidates: { der_mass_jet_jet: [7,8,9] } }, { selectionCriteriaAttribute: "PRI_jet_leading_pt", selectionCriteriaPredicate: ">", selectionCriteriaValue: 3, splitCandidates: { der_mass_jet_jet: [7,8,9] } } ] }'),
object_construct('der_mass_jet_jet',der_mass_jet_jet,'PRI_jet_leading_pt',PRI_jet_leading_pt, 'PRI_jet_subleading_pt',PRI_jet_subleading_pt,'label',label
-- This UDTF will return all information required for a gini-based branch split decision.
-- The input object describes a tree in the process of being built.
-- Each node contains a subset filter to qualify/disqualify each row, and either
-- 1) a list of children (keep drilling)
-- 2) a map of split candidates (do the gini calcs)
-- 3) nothing else (tree terminated here previously)
-- Example:
-- {
-- rootBranch: {
-- subsetFilter: {