Skip to content

Instantly share code, notes, and snippets.

@saisrirampur
Last active February 5, 2024 03:25
Show Gist options
  • Select an option

  • Save saisrirampur/22b3399077cf8490301039aa5ca7ab94 to your computer and use it in GitHub Desktop.

Select an option

Save saisrirampur/22b3399077cf8490301039aa5ca7ab94 to your computer and use it in GitHub Desktop.
-- GROUP 1: Unclustered staging and raw tables
CREATE TABLE sai_tests.unclustered (
id INT64,
column_string STRING,
column_int INT64,
column_float FLOAT64,
column_timestamp TIMESTAMP
);
INSERT INTO sai_tests.unclustered WITH number_series AS (
SELECT ROW_NUMBER() OVER() AS id
FROM UNNEST(GENERATE_ARRAY(1, 100000)) AS a,
UNNEST(GENERATE_ARRAY(1, 2000)) AS b
)
SELECT
id,
CONCAT('string_', CAST(id AS STRING)) AS column_string,
CAST(id * 2 AS INT64) AS column_int,
CAST(id * 1.5 AS FLOAT64) AS column_float,
TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL id SECOND) AS column_timestamp
FROM number_series;
CREATE TABLE sai_tests.unclustered_stage (
id INT64,
column_string STRING,
column_int INT64,
column_float FLOAT64,
column_timestamp TIMESTAMP
);
INSERT INTO sai_tests.unclustered_stage WITH number_series AS (
SELECT ROW_NUMBER() OVER() AS id
FROM UNNEST(GENERATE_ARRAY(1, 10000)) AS a,
UNNEST(GENERATE_ARRAY(1, 1000)) AS b
)
SELECT
id,
CONCAT('new_string_', CAST(id AS STRING)) AS column_string,
CAST(id * 3 AS INT64) AS column_int,
CAST(id * 2.5 AS FLOAT64) AS column_float,
TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL id MINUTE) AS column_timestamp
FROM number_series
WHERE id <= 10000000;
MERGE sai_tests.unclustered AS final
USING (
SELECT * FROM sai_tests.unclustered_stage
WHERE id > 500000 AND id < 510000
) AS staging ON
final.id = staging.id
WHEN MATCHED THEN
UPDATE SET final.column_string = staging.column_string, final.column_int = staging.column_int, final.column_float = staging.column_float, final.column_timestamp = staging.column_timestamp
WHEN NOT MATCHED THEN
INSERT (id, column_string, column_int, column_float, column_timestamp)
VALUES (staging.id, staging.column_string, staging.column_int, staging.column_float, staging.column_timestamp);
-- GROUP 1: Clustered staging and final tables
CREATE TABLE sai_tests.clustered (
id INT64,
column_string STRING,
column_int INT64,
column_float FLOAT64,
column_timestamp TIMESTAMP
) CLUSTER BY id AS
SELECT * FROM sai_tests.unclustered;
CREATE TABLE sai_tests.clustered_stage (
id INT64,
column_string STRING,
column_int INT64,
column_float FLOAT64,
column_timestamp TIMESTAMP
) CLUSTER BY id AS
SELECT * FROM sai_tests.unclustered_stage;
MERGE sai_tests.clustered AS final
USING (
SELECT * FROM sai_tests.clustered_stage
WHERE id > 500000 AND id < 510000
) AS staging
ON final.id = staging.id
WHEN MATCHED THEN
UPDATE SET final.column_string = staging.column_string, final.column_int = staging.column_int, final.column_float = staging.column_float, final.column_timestamp = staging.column_timestamp
WHEN NOT MATCHED THEN
INSERT (id, column_string, column_int, column_float, column_timestamp)
VALUES (staging.id, staging.column_string, staging.column_int, staging.column_float, staging.column_timestamp);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment