Skip to content

Instantly share code, notes, and snippets.

@munro
Created January 27, 2022 00:58
Show Gist options
  • Save munro/374b2538be634532a253a8d28d826070 to your computer and use it in GitHub Desktop.
Save munro/374b2538be634532a253a8d28d826070 to your computer and use it in GitHub Desktop.
BigQuery Connected Components algorithm
-- Does a depth of 10
CREATE OR REPLACE PROCEDURE `mydataset`.connected_components_by_edge(
table_name STRING,
left_column STRING,
right_column STRING,
output_temp_table_name STRING
)
OPTIONS(
description="Connected components algorithm, clusters nodes by unidirected edges by the smallest node ID."
)
BEGIN EXECUTE IMMEDIATE """
CREATE OR REPLACE TEMP TABLE """ || output_temp_table_name || """ AS
WITH edges AS (
SELECT
LEAST(""" || left_column || ", " || right_column || """) AS a,
GREATEST(""" || left_column || ", " || right_column || """) AS b
FROM """ || table_name || """
),
e1 AS (
SELECT LEAST(e.a, follow.a) AS a, e.b FROM edges e JOIN edges follow ON e.a = follow.b WHERE e.a != e.b AND e.a != LEAST(e.a, follow.a)
), e2 AS (
SELECT LEAST(e.a, follow.a) AS a, e.b FROM e1 e JOIN edges follow ON e.a = follow.b WHERE e.a != e.b AND e.a != LEAST(e.a, follow.a)
), e3 AS (
SELECT LEAST(e.a, follow.a) AS a, e.b FROM e2 e JOIN edges follow ON e.a = follow.b WHERE e.a != e.b AND e.a != LEAST(e.a, follow.a)
), e4 AS (
SELECT LEAST(e.a, follow.a) AS a, e.b FROM e3 e JOIN edges follow ON e.a = follow.b WHERE e.a != e.b AND e.a != LEAST(e.a, follow.a)
), e5 AS (
SELECT LEAST(e.a, follow.a) AS a, e.b FROM e4 e JOIN edges follow ON e.a = follow.b WHERE e.a != e.b AND e.a != LEAST(e.a, follow.a)
), e6 AS (
SELECT LEAST(e.a, follow.a) AS a, e.b FROM e5 e JOIN edges follow ON e.a = follow.b WHERE e.a != e.b AND e.a != LEAST(e.a, follow.a)
), e7 AS (
SELECT LEAST(e.a, follow.a) AS a, e.b FROM e6 e JOIN edges follow ON e.a = follow.b WHERE e.a != e.b AND e.a != LEAST(e.a, follow.a)
), e8 AS (
SELECT LEAST(e.a, follow.a) AS a, e.b FROM e7 e JOIN edges follow ON e.a = follow.b WHERE e.a != e.b AND e.a != LEAST(e.a, follow.a)
), e9 AS (
SELECT LEAST(e.a, follow.a) AS a, e.b FROM e8 e JOIN edges follow ON e.a = follow.b WHERE e.a != e.b AND e.a != LEAST(e.a, follow.a)
), e10 AS (
SELECT LEAST(e.a, follow.a) AS a, e.b FROM e9 e JOIN edges follow ON e.a = follow.b WHERE e.a != e.b AND e.a != LEAST(e.a, follow.a)
)
SELECT
a AS cluster_id,
b AS node_id,
degrees
FROM (
SELECT a, a AS b, 0 AS degrees FROM edges
UNION ALL SELECT b AS a, b, 0 AS degrees FROM edges
UNION ALL SELECT *, 1 AS degrees FROM edges e WHERE a != b
UNION ALL SELECT *, 2 AS degrees FROM e1
UNION ALL SELECT *, 3 AS degrees FROM e2
UNION ALL SELECT *, 4 AS degrees FROM e3
UNION ALL SELECT *, 5 AS degrees FROM e4
UNION ALL SELECT *, 6 AS degrees FROM e5
UNION ALL SELECT *, 7 AS degrees FROM e6
UNION ALL SELECT *, 8 AS degrees FROM e7
UNION ALL SELECT *, 9 AS degrees FROM e8
UNION ALL SELECT *, 10 AS degrees FROM e9
UNION ALL SELECT *, 11 AS degrees FROM e10
)
WHERE TRUE
QUALIFY ROW_NUMBER() OVER (PARTITION BY b ORDER BY a, degrees) = 1
""";
END;
-- Example
CREATE TEMP TABLE edges AS SELECT * FROM UNNEST(CAST(
ARRAY[(0, 1), (1, 2), (2, 3), (2, 4), (4, 5), (0, 5), (6, 7), (7, 8)]
AS ARRAY<STRUCT<a INT64, b INT64>>
)) e;
CALL `mydataset`.connected_components_by_edge("edges", "a", "b", "clusters");
SELECT * FROM clusters ORDER BY cluster_id, node_id;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment