Created
November 19, 2012 15:58
-
-
Save andrewxhill/4111438 to your computer and use it in GitHub Desktop.
Kmeans better still, ~5 seconds for 50,000 points
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--Number of iterations is at 20, which is probably very high | |
--You can reduce it by replace 20 in both iterations < and iteration = | |
WITH RECURSIVE dims AS (SELECT 3 as d, 4 as i), --d^2 max # clusters, i = iterations to refine | |
series AS (SELECT GENERATE_SERIES(1,d*d) s FROM dims), | |
simple AS (SELECT distinct(ST_SnapToGrid(the_geom,0.001,0.001)) as the_geom FROM occurrence_search_coords), | |
env AS (SELECT ST_Envelope(ST_Collect(the_geom)) as geom FROM simple), | |
grid AS (SELECT ST_Centroid(CDB_RectangleGrid( | |
geom, | |
(ST_XMax(geom)-ST_XMin(geom))/d, | |
(ST_YMax(geom)-ST_YMin(geom))/d | |
)) as geom | |
--row_number() OVER (ORDER BY GENERATE_SERIES(1,d*d) ASC) | |
FROM env,dims | |
), | |
clusters AS ( | |
SELECT geom,row_number() OVER (ORDER BY geom ASC) as id FROM grid | |
), | |
means(centroid, id, iteration) AS ( | |
SELECT Array_agg(geom) as centroid, ARRAY_agg(id) as id, 1 FROM clusters | |
UNION ALL | |
(SELECT | |
Array( | |
SELECT | |
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM simple k WHERE | |
ST_Distance(k.the_geom,m.centroid[s]) = ( | |
SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p | |
) | |
) | |
FROM series s | |
), | |
m.id, | |
iteration+1 | |
FROM means m, dims | |
WHERE | |
iteration < i | |
) | |
), | |
result AS (SELECT unnest(centroid) as centroid, unnest(id) as id FROM means, dims WHERE iteration = i) | |
--TO get the centroid of the clusters | |
--SELECT centroid as the_geom,id, iteration FROM result | |
--All points for each cluster | |
--SELECT k.the_geom_webmercator, (SELECT m.id FROM result m ORDER BY m.centroid <-> k.the_geom LIMIT 1 ) as kmean_id | |
--FROM occurrence_search_coords k | |
--Write results to a new column called kmeans_id (type numeric) | |
UPDATE occurrence_search_coords SET | |
kmeans_id = (SELECT m.id FROM result m ORDER BY m.centroid <-> k.the_geom LIMIT 1 ) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment