Skip to content

Instantly share code, notes, and snippets.

@hui-zheng
Last active May 26, 2023 14:14
Show Gist options
  • Save hui-zheng/f7e972bcbe9cde0c6cb6318f7270b67a to your computer and use it in GitHub Desktop.
Save hui-zheng/f7e972bcbe9cde0c6cb6318f7270b67a to your computer and use it in GitHub Desktop.
This list provides BigQuery SQL templates that remove duplicates for large size timestamp partitioned table (using MERGE statement) and for small size table or a non-partition table (Using REPLACE TABLE statement)
-- WARNING: back up the table before this operation
-- FOR large size timestamp partitioned table
-- -------------------------------------------
-- -- To de-duplicate rows of a given range of a partition table, using surrage_key as unique id
-- -------------------------------------------
DECLARE dt_start DEFAULT TIMESTAMP("2019-09-17T00:00:00", "America/Los_Angeles") ;
DECLARE dt_end DEFAULT TIMESTAMP("2019-09-22T00:00:00", "America/Los_Angeles");
MERGE INTO `gcp_project`.`data_set`.`the_table` AS INTERNAL_DEST
USING (
SELECT k.*
FROM (
SELECT ARRAY_AGG(original_data LIMIT 1)[OFFSET(0)] k
FROM `gcp_project`.`data_set`.`the_table` AS original_data
WHERE stamp BETWEEN dt_start AND dt_end
GROUP BY surrogate_key
)
) AS INTERNAL_SOURCE
ON FALSE
WHEN NOT MATCHED BY SOURCE
AND INTERNAL_DEST.stamp BETWEEN dt_start AND dt_end -- remove all data in partiion range
THEN DELETE
WHEN NOT MATCHED THEN INSERT ROW
-- FOR a small size table or a non-partition table
-------------------------------------------
-- To de-duplicate rows of the whole table using surrage_key as unique id
-------------------------------------------
CREATE OR REPLACE TABLE `semios-dbt`.`data_set`.`pois`
AS
SELECT k.*
FROM (
SELECT ARRAY_AGG(original_data LIMIT 1)[OFFSET(0)] k
FROM `gcp_project`.`data_set`.`the_table` AS original_data
GROUP BY surrogate_key
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment