Skip to content

Instantly share code, notes, and snippets.

@andrewxhill
Created October 11, 2012 19:55
Show Gist options
  • Save andrewxhill/3875088 to your computer and use it in GitHub Desktop.
Save andrewxhill/3875088 to your computer and use it in GitHub Desktop.
Straight SQL Kmean
--Number of iterations is at 20, which is probably very high
--You can reduce it by replace 20 in both iterations < and iteration =
WITH RECURSIVE dims AS (SELECT 3 as d),
env AS (SELECT ST_Envelope(ST_Collect(the_geom)) as geom FROM kmean),
grid AS (SELECT ST_Centroid(CDB_RectangleGrid(
geom,
(ST_XMax(geom)-ST_XMin(geom))/d,
(ST_YMax(geom)-ST_YMin(geom))/d
)) as geom
--row_number() OVER (ORDER BY GENERATE_SERIES(1,d*d) ASC)
FROM env,dims
),
clusters AS (
SELECT geom,row_number() OVER (ORDER BY geom ASC) as id FROM grid
),
means(centroid, id, iteration) AS (
SELECT Array_agg(geom) as centroid, ARRAY_agg(id) as id, 1 FROM clusters
UNION ALL
(SELECT
ARRAY[
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM kmean k WHERE
ST_Distance(k.the_geom,m.centroid[1]) = (SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p)
),
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM kmean k WHERE
ST_Distance(k.the_geom,m.centroid[2]) = (SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p)
),
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM kmean k WHERE
ST_Distance(k.the_geom,m.centroid[3]) = (SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p)
),
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM kmean k WHERE
ST_Distance(k.the_geom,m.centroid[4]) = (SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p)
),
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM kmean k WHERE
ST_Distance(k.the_geom,m.centroid[5]) = (SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p)
),
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM kmean k WHERE
ST_Distance(k.the_geom,m.centroid[6]) = (SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p)
),
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM kmean k WHERE
ST_Distance(k.the_geom,m.centroid[7]) = (SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p)
),
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM kmean k WHERE
ST_Distance(k.the_geom,m.centroid[8]) = (SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p)
),
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM kmean k WHERE
ST_Distance(k.the_geom,m.centroid[9]) = (SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p)
)
],
m.id,
iteration+1
FROM means m
WHERE
iteration < 20
)
),
result AS (SELECT unnest(centroid) as centroid, unnest(id) as id FROM means WHERE iteration = 20)
--TO get the centroid of the clusters
--SELECT centroid as the_geom,id, iteration FROM result
--All points for each cluster
SELECT k.the_geom_webmercator, (SELECT m.id FROM result m ORDER BY m.centroid <-> k.the_geom LIMIT 1 ) as id FROM kmean k
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment