Created
November 29, 2012 19:44
-
-
Save andrewxhill/4171372 to your computer and use it in GitHub Desktop.
Super naive but fast kmeans
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--50,000 records ~ 1.2s, 500,000 records ~ 25s | |
--Number of iterations is at 20, which is probably very high | |
--You can reduce it by replace 20 in both iterations < and iteration = | |
WITH RECURSIVE dims AS (SELECT 3 as d, 4 as i), --d^2 max # clusters, i = iterations to refine | |
series AS (SELECT GENERATE_SERIES(1,d*d) s FROM dims), | |
bx AS (SELECT ST_Envelope(ST_Collect(the_geom)) as bx_g FROM occurrence_search_coords), | |
grd AS (SELECT CDB_RectangleGrid(bx_g,((ST_XMax(bx_g)-ST_XMin(bx_g))/50),((ST_YMax(bx_g)-ST_YMin(bx_g))/50) ) as grd_g FROM bx), | |
simple AS (SELECT ST_Centroid(grd_g) as the_geom FROM grd,occurrence_search_coords WHERE ST_Intersects(grd_g,the_geom) GROUP BY grd_g), | |
env AS (SELECT ST_Envelope(ST_Collect(the_geom)) as geom FROM simple), | |
grid AS (SELECT ST_Centroid(CDB_RectangleGrid( | |
geom, | |
(ST_XMax(geom)-ST_XMin(geom))/d, | |
(ST_YMax(geom)-ST_YMin(geom))/d | |
)) as geom | |
--row_number() OVER (ORDER BY GENERATE_SERIES(1,d*d) ASC) | |
FROM env,dims | |
), | |
clusters AS ( | |
SELECT geom,row_number() OVER (ORDER BY geom ASC) as id FROM grid | |
), | |
means(centroid, id, iteration) AS ( | |
SELECT Array_agg(geom) as centroid, ARRAY_agg(id) as id, 1 FROM clusters | |
UNION ALL | |
(SELECT | |
Array( | |
SELECT | |
(SELECT ST_Centroid(ST_Collect(the_geom)) FROM simple k WHERE | |
ST_Distance(k.the_geom,m.centroid[s]) = ( | |
SELECT min(ST_Distance(p,k.the_geom)) FROM UNNEST(m.centroid) p | |
) | |
) | |
FROM series s | |
), | |
m.id, | |
iteration+1 | |
FROM means m, dims | |
WHERE | |
iteration < i | |
) | |
), | |
result AS (SELECT unnest(centroid) as centroid, unnest(id) as id FROM means, dims WHERE iteration = i) | |
--TO get the centroid of the clusters | |
--SELECT centroid as the_geom,id, iteration FROM result | |
--All points for each cluster | |
SELECT k.the_geom_webmercator, (SELECT m.id FROM result m ORDER BY m.centroid <-> k.the_geom LIMIT 1 ) as kmean_id FROM occurrence_search_coords k | |
--Write results to a new column called kmeans_id (type numeric) | |
--UPDATE occurrence_search_coords SET kmeans_id = (SELECT m.id FROM result m ORDER BY m.centroid <-> k.the_geom LIMIT 1 ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment