Skip to content

Instantly share code, notes, and snippets.

@andrewxhill
Created January 3, 2014 18:18
Show Gist options
  • Save andrewxhill/8243282 to your computer and use it in GitHub Desktop.
Save andrewxhill/8243282 to your computer and use it in GitHub Desktop.
First pass at a simple point clustering algo for plpgsql
CREATE TYPE axh_cdb_row_clusters (
cartodb_id int,
cluster_id int,
size int
);
CREATE OR REPLACE FUNCTION axh_cdb_row_cluster(_t text, zoom INT)
RETURNS SETOF axh_cdb_row_clusters AS $$
DECLARE
sql TEXT;
cluster_n INT := 1;
clustered INT[];
clusterA INT[];
clusterB INT[];
history INT[];
seed INT;
rows INT;
alen INT;
marker INT;
resolution INT := 10;
resolution_v INT;
ret axh_cdb_row_clusters;
BEGIN
-- Get the total number of rows we are looking at
sql := 'SELECT count(*) FROM (' || _t || ') t';
EXECUTE sql INTO rows;
-- Get the first value to start with
WHILE COALESCE(array_length(clustered, 1),0) < rows LOOP
EXECUTE 'SELECT min(cartodb_id) FROM (' || _t || ') t WHERE ($1 IS NULL OR NOT (cartodb_id = ANY ($1)))' INTO seed USING clustered;
-- Get the first cluster
sql := 'SELECT array_agg(cartodb_id) FROM (' || _t || ') t WHERE ST_Distance(the_geom::geography, (SELECT the_geom::geography FROM (' || _t || ') g WHERE cartodb_id = ' || seed || ')) < '|| resolution ||' * CDB_XYZ_Resolution( '|| zoom || ')';
EXECUTE sql INTO clusterA;
-- Reset variable resolution
resolution_v := resolution;
alen := array_length(clusterA, 1);
-- Evaluate the cluster
WHILE alen > resolution_v LOOP
--Extend the resolution
resolution_v := LEAST(resolution_v + resolution, resolution_v + (alen / resolution_v));
EXECUTE 'SELECT array_agg(cartodb_id) FROM (' || _t || ') t WHERE NOT (cartodb_id = ANY ($1)) AND ST_Distance(the_geom::geography, (SELECT the_geom::geography FROM (' || _t || ') g WHERE cartodb_id = ' || seed || ')) < '|| resolution_v ||' * CDB_XYZ_Resolution( '|| zoom || ')' INTO clusterB USING clusterA;
alen := array_length(clusterB, 1);
clusterA = array_cat(clusterA, clusterB);
END LOOP;
SELECT seed cartodb_id, cluster_n cluster_id, array_length(clusterA, 1) size INTO ret.cartodb_id, ret.cluster_id, ret.size;
RETURN NEXT ret;
cluster_n := cluster_n + 1;
clustered = array_cat(clustered, clusterA);
END LOOP;
RETURN;
END
$$ language plpgsql;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment