Skip to content

Instantly share code, notes, and snippets.

@andrewxhill
Created November 29, 2012 19:44
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andrewxhill/4171372 to your computer and use it in GitHub Desktop.
Save andrewxhill/4171372 to your computer and use it in GitHub Desktop.
Super naive but fast kmeans
--50,000 records ~ 1.2s, 500,000 records ~ 25s
--Number of iterations is at 20, which is probably very high
--You can reduce it by replace 20 in both iterations < and iteration =
WITH RECURSIVE dims AS (SELECT 3 as d, 4 as i), --d^2 max # clusters, i = iterations to refine
series AS (SELECT GENERATE_SERIES(1,d*d) s FROM dims),
bx AS (SELECT ST_Envelope(ST_Collect(the_geom)) as bx_g FROM occurrence_search_coords),
grd AS (SELECT CDB_RectangleGrid(bx_g,((ST_XMax(bx_g)-ST_XMin(bx_g))/50),((ST_YMax(bx_g)-ST_YMin(bx_g))/50) ) as grd_g FROM bx),
simple AS (SELECT ST_Centroid(grd_g) as the_geom FROM grd,occurrence_search_coords WHERE ST_Intersects(grd_g,the_geom) GROUP BY grd_g),
env AS (SELECT ST_Envelope(ST_Collect(the_geom)) as geom FROM simple),
grid AS (SELECT ST_Centroid(CDB_RectangleGrid(
geom,
(ST_XMax(geom)-ST_XMin(geom))/d,
(ST_YMax(geom)-ST_YMin(geom))/d
)) as geom
--row_number() OVER (ORDER BY GENERATE_SERIES(1,d*d) ASC)
FROM env,dims
),
clusters AS (
SELECT geom,row_number() OVER (ORDER BY geom ASC) as id FROM grid
),
means(centroid, id, iteration) AS (
SELECT Array_agg(geom) as centroid, ARRAY_agg(id) as id, 1 FROM clusters
UNION ALL
(SELECT
Array(
SELECT
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM simple k WHERE
ST_Distance(k.the_geom,m.centroid[s]) = (
SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p
)
)
FROM series s
),
m.id,
iteration+1
FROM means m, dims
WHERE
iteration < i
)
),
result AS (SELECT unnest(centroid) as centroid, unnest(id) as id FROM means, dims WHERE iteration = i)
--TO get the centroid of the clusters
--SELECT centroid as the_geom,id, iteration FROM result
--All points for each cluster
SELECT k.the_geom_webmercator, (SELECT m.id FROM result m ORDER BY m.centroid <-> k.the_geom LIMIT 1 ) as kmean_id FROM occurrence_search_coords k
--Write results to a new column called kmeans_id (type numeric)
--UPDATE occurrence_search_coords SET kmeans_id = (SELECT m.id FROM result m ORDER BY m.centroid <-> k.the_geom LIMIT 1 )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment