Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MarkEdmondson1234/990270f3190b5c675c895c7b066be82a to your computer and use it in GitHub Desktop.
Save MarkEdmondson1234/990270f3190b5c675c895c7b066be82a to your computer and use it in GitHub Desktop.
How to create a user mapping table (or Identity Graph) based on all the available user identifiers in the Google Analytics 4 BigQuery exports (like device-IDs or customer-IDs). Full article on stacktonic.com
-- Author: Krisjan Oldekamp
-- https://stacktonic.com/article/create-a-user-mapping-table-based-on-the-google-analytics-4-big-query-dataset
DECLARE
lookback_window INT64 DEFAULT 90; -- How many days to lookback into the dataset to search for IDs (compared to today)
-- Deduplicate array of struct
CREATE TEMP FUNCTION
DEDUP(val ANY TYPE) AS ((
SELECT
ARRAY_AGG(t)
FROM (
SELECT
MAX(v.timestamp) AS timestamp,
v.id
FROM
UNNEST(val) v
GROUP BY
v.id
ORDER BY
timestamp DESC
LIMIT
100) t ));
WITH
ga_user_ids AS (
-- Select all user-IDs from the GA4 dataset in the specified time period
SELECT
*
FROM (
SELECT
user_pseudo_id AS ga_client_id,
-- Device-ID
user_id AS customer_id,
-- Custom defined User-ID (e.g. Customer-ID)
# no user_properties in public GA4 dataset so this line breaks, will work in 'real' export
#(SELECT value.string_value FROM UNNEST(user_properties) WHERE key = "relay_id") AS customer_id_secondary, -- Secondary custom user-ID
'missing' AS customer_id_secondary,
(
SELECT
value.string_value
FROM
UNNEST(event_params)
WHERE
event_name = "page_view"
AND key = "gclid") AS gclid,
-- Click-ID
ecommerce.transaction_id AS transaction_id,
-- Order-IDs
MAX(event_timestamp) AS event_timestamp -- Timestamps
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d",DATE_SUB(CURRENT_DATE(), INTERVAL lookback_window DAY))
AND FORMAT_DATE("%Y%m%d",DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
GROUP BY
ga_client_id,
customer_id,
customer_id_secondary,
gclid,
transaction_id )
-- Filter out users without any additional user-IDs besides ga_client_id
WHERE
customer_id IS NOT NULL
OR customer_id_secondary IS NOT NULL
OR gclid IS NOT NULL
OR transaction_id IS NOT NULL )
-- Select all identifiers and group on customer-ID.
SELECT
customer_id,
MAX(TIMESTAMP_MICROS(event_timestamp)) AS timestamp_last_visit,
-- Aggegrate customer-IDs to an array of structs
DEDUP(ARRAY_AGG(
IF
(ga_client_id IS NOT NULL,
STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp,
ga_client_id AS id),
NULL ) IGNORE NULLS
ORDER BY
event_timestamp DESC )) AS ga_client_id,
-- Aggegrate secondary customer-IDs to an array of structs
DEDUP(ARRAY_AGG(
IF
(customer_id_secondary IS NOT NULL,
STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp,
customer_id_secondary AS id),
NULL ) IGNORE NULLS
ORDER BY
event_timestamp DESC )) AS customer_id_secondary,
-- Aggegrate GCLIDs to an array of structs
DEDUP(ARRAY_AGG(
IF
(gclid IS NOT NULL,
STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp,
gclid AS id),
NULL ) IGNORE NULLS
ORDER BY
event_timestamp DESC )) AS gclid,
-- Aggegrate transaction-IDs to an array of structs
DEDUP(ARRAY_AGG(
IF
(transaction_id IS NOT NULL,
STRUCT(TIMESTAMP_MICROS(event_timestamp) AS timestamp,
transaction_id AS id),
NULL ) IGNORE NULLS
ORDER BY
event_timestamp DESC )) AS transaction_id,
FROM
ga_user_ids
WHERE
customer_id IS NOT NULL
GROUP BY
customer_id
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment