Skip to content

Instantly share code, notes, and snippets.

@bindiego
Last active February 28, 2023 14:29
Show Gist options
  • Save bindiego/934d77b3597da0d02547b4d24db188a7 to your computer and use it in GitHub Desktop.
Save bindiego/934d77b3597da0d02547b4d24db188a7 to your computer and use it in GitHub Desktop.
BigQuery Deduplication
with n_dups as
(
SELECT event_name, event_timestamp, user_pseudo_id, count(1)-1 as n_duplicates
FROM `project.dataset.events_20190610`
group by event_name, event_timestamp, user_pseudo_id
)
select n_duplicates, count(1) as n_cases
from n_dups
group by n_duplicates
order by n_cases desc
-- Check duplications
SELECT
date,
COUNT(DISTINCT v) AS unique_values,
COUNT(*) AS value
FROM
`transactions.testdata`
GROUP BY
1;
-- Simple way
CREATE OR REPLACE TABLE `transactions.testdata`
PARTITION BY date
AS SELECT DISTINCT * FROM `transactions.testdata`;
-- Use merge
MERGE `transactions.testdata` t
USING (
SELECT DISTINCT *
FROM `transactions.testdata`
WHERE date=CURRENT_DATE()
)
ON FALSE
WHEN NOT MATCHED BY SOURCE AND date=CURRENT_DATE() THEN DELETE
WHEN NOT MATCHED BY TARGET THEN INSERT ROW
-- Discint on 2 columns as an example
CREATE OR REPLACE TABLE `transactions.testdata`
AS
SELECT * EXCEPT(dup)
FROM (
SELECT *,
ROW_NUMBER() OVER(PARTITION BY amount, timestamp) AS dup
FROM
`transactions.testdata`
)
WHERE dup = 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment