Skip to content

Instantly share code, notes, and snippets.

@andrewxhill
Created November 19, 2012 15:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrewxhill/4111438 to your computer and use it in GitHub Desktop.
Save andrewxhill/4111438 to your computer and use it in GitHub Desktop.
Kmeans better still, ~5 seconds for 50,000 points
--Number of iterations is at 20, which is probably very high
--You can reduce it by replace 20 in both iterations < and iteration =
WITH RECURSIVE dims AS (SELECT 3 as d, 4 as i), --d^2 max # clusters, i = iterations to refine
series AS (SELECT GENERATE_SERIES(1,d*d) s FROM dims),
simple AS (SELECT distinct(ST_SnapToGrid(the_geom,0.001,0.001)) as the_geom FROM occurrence_search_coords),
env AS (SELECT ST_Envelope(ST_Collect(the_geom)) as geom FROM simple),
grid AS (SELECT ST_Centroid(CDB_RectangleGrid(
geom,
(ST_XMax(geom)-ST_XMin(geom))/d,
(ST_YMax(geom)-ST_YMin(geom))/d
)) as geom
--row_number() OVER (ORDER BY GENERATE_SERIES(1,d*d) ASC)
FROM env,dims
),
clusters AS (
SELECT geom,row_number() OVER (ORDER BY geom ASC) as id FROM grid
),
means(centroid, id, iteration) AS (
SELECT Array_agg(geom) as centroid, ARRAY_agg(id) as id, 1 FROM clusters
UNION ALL
(SELECT
Array(
SELECT
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM simple k WHERE
ST_Distance(k.the_geom,m.centroid[s]) = (
SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p
)
)
FROM series s
),
m.id,
iteration+1
FROM means m, dims
WHERE
iteration < i
)
),
result AS (SELECT unnest(centroid) as centroid, unnest(id) as id FROM means, dims WHERE iteration = i)
--TO get the centroid of the clusters
--SELECT centroid as the_geom,id, iteration FROM result
--All points for each cluster
--SELECT k.the_geom_webmercator, (SELECT m.id FROM result m ORDER BY m.centroid <-> k.the_geom LIMIT 1 ) as kmean_id
--FROM occurrence_search_coords k
--Write results to a new column called kmeans_id (type numeric)
UPDATE occurrence_search_coords SET
kmeans_id = (SELECT m.id FROM result m ORDER BY m.centroid <-> k.the_geom LIMIT 1 )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment