Last active
February 28, 2023 14:29
-
-
Save bindiego/934d77b3597da0d02547b4d24db188a7 to your computer and use it in GitHub Desktop.
BigQuery Deduplication
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with n_dups as | |
( | |
SELECT event_name, event_timestamp, user_pseudo_id, count(1)-1 as n_duplicates | |
FROM `project.dataset.events_20190610` | |
group by event_name, event_timestamp, user_pseudo_id | |
) | |
select n_duplicates, count(1) as n_cases | |
from n_dups | |
group by n_duplicates | |
order by n_cases desc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Check duplications | |
SELECT | |
date, | |
COUNT(DISTINCT v) AS unique_values, | |
COUNT(*) AS value | |
FROM | |
`transactions.testdata` | |
GROUP BY | |
1; | |
-- Simple way | |
CREATE OR REPLACE TABLE `transactions.testdata` | |
PARTITION BY date | |
AS SELECT DISTINCT * FROM `transactions.testdata`; | |
-- Use merge | |
MERGE `transactions.testdata` t | |
USING ( | |
SELECT DISTINCT * | |
FROM `transactions.testdata` | |
WHERE date=CURRENT_DATE() | |
) | |
ON FALSE | |
WHEN NOT MATCHED BY SOURCE AND date=CURRENT_DATE() THEN DELETE | |
WHEN NOT MATCHED BY TARGET THEN INSERT ROW | |
-- Discint on 2 columns as an example | |
CREATE OR REPLACE TABLE `transactions.testdata` | |
AS | |
SELECT * EXCEPT(dup) | |
FROM ( | |
SELECT *, | |
ROW_NUMBER() OVER(PARTITION BY amount, timestamp) AS dup | |
FROM | |
`transactions.testdata` | |
) | |
WHERE dup = 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment