Skip to content

Instantly share code, notes, and snippets.

@jklukas
Created January 16, 2020 19:15
Show Gist options
  • Save jklukas/6f948d66789e211e7a4470bd82e84cf4 to your computer and use it in GitHub Desktop.
Save jklukas/6f948d66789e211e7a4470bd82e84cf4 to your computer and use it in GitHub Desktop.
WITH
-- Distinct document_ids and their minimum submission_timestamp today
-- not including document_ids that only occur on or after @end_time
distinct_document_ids AS (
SELECT
document_id,
MIN(submission_timestamp) AS submission_timestamp
FROM
`moz-fx-data-shared-prod.telemetry_live.main_v4`
WHERE
DATE(submission_timestamp) >= DATE_SUB(
DATE(@start_time),
INTERVAL @num_preceding_days DAY
)
AND submission_timestamp < @end_time
GROUP BY
document_id
HAVING
submission_timestamp >= @start_time),
-- A single slice of a live ping table.
base AS (
SELECT
*
FROM
`moz-fx-data-shared-prod.telemetry_live.main_v4`
JOIN
distinct_document_ids
USING
-- Retain only the first seen documents for each ID, according to timestamp.
(document_id, submission_timestamp)
WHERE
submission_timestamp >= @start_time
AND submission_timestamp < @end_time),
--
-- Order documents by assigning a row number.
numbered_duplicates AS (
SELECT
*,
ROW_NUMBER() OVER (PARTITION BY document_id) AS _n
FROM
base)
--
-- Retain only one document for each ID.
SELECT
* EXCEPT(_n)
FROM
numbered_duplicates
WHERE
_n = 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment