Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save krisjan-oldekamp/0a1a1c0085e6e6f68306f184ed1af4a1 to your computer and use it in GitHub Desktop.
Save krisjan-oldekamp/0a1a1c0085e6e6f68306f184ed1af4a1 to your computer and use it in GitHub Desktop.
How to create a user mapping table (or Identity Graph) based on all the available user identifiers in the Google Analytics 4 BigQuery exports (like device-IDs or customer-IDs). Full article on stacktonic.com
-- Author: Krisjan Oldekamp
-- https://stacktonic.com/article/create-a-user-mapping-table-based-on-the-google-analytics-4-big-query-dataset
declare lookback_window int64 default 90; -- how many days to lookback into the dataset to search for ids (compared to today)
-- udf: deduplicate array of struct
create temp function dedup(arr any type) as ((
select
array_agg(t)
from (
select max(a.timestamp) as timestamp, a.id from unnest(arr) a group by a.id order by timestamp desc limit 100
) t
));
with ga_user_ids as (
-- select all user-ids from the ga4 dataset in the specified time period
select
*
from (
select
user_pseudo_id as ga_client_id, -- device-id
user_id as customer_id, -- custom defined user-id (e.g. customer-id)
(select value.string_value from unnest(user_properties) where key = 'relay_id') as customer_id_secondary, -- secondary custom user-id
(select value.string_value from unnest(event_params) where event_name = 'page_view' and key = 'gclid') as gclid, -- click-id
ecommerce.transaction_id as transaction_id, -- order-ids
max(event_timestamp) as event_timestamp -- timestamps
from `<your-project>.analytics_<your-dataset>.events_*`
where
_table_suffix between format_date('%Y%m%d',date_sub(current_date(), interval lookback_window day))
and format_date('%Y%m%d',date_sub(current_date(), interval 1 day))
group by
1,2,3,4,5
)
-- filter out users without any additional user-ids besides ga_client_id
where
customer_id is not null
or customer_id_secondary is not null
or gclid is not null
or transaction_id is not null
)
-- select all identifiers and group on customer-id.
select
customer_id,
max(timestamp_micros(event_timestamp)) as timestamp_last_visit,
-- aggegrate customer-ids to an array of structs
dedup(array_agg(
if(ga_client_id is not null,
struct(timestamp_micros(event_timestamp) as timestamp, ga_client_id as id),
null
) ignore nulls order by event_timestamp desc
)) as ga_client_id,
-- aggegrate secondary customer-ids to an array of structs
dedup(array_agg(
if(customer_id_secondary is not null,
struct(timestamp_micros(event_timestamp) as timestamp, customer_id_secondary as id),
null
)ignore nulls order by event_timestamp desc
)) as customer_id_secondary,
-- aggegrate gclids to an array of structs
dedup(array_agg(
if(gclid is not null,
struct(timestamp_micros(event_timestamp) as timestamp, gclid as id),
null
) ignore nulls order by event_timestamp desc
)) as gclid,
-- aggegrate transaction-ids to an array of structs
dedup(array_agg(
if(transaction_id is not null,
struct(timestamp_micros(event_timestamp) as timestamp, transaction_id as id),
null
) ignore nulls order by event_timestamp desc
)) as transaction_id,
from ga_user_ids
where
customer_id is not null
group by 1
@MarkEdmondson1234
Copy link

I think the last few lines should read:

FROM
ga_user_ids
WHERE customer_id IS NOT NULL
GROUP BY
customer_id

@krisjan-oldekamp
Copy link
Author

I think the last few lines should read:

FROM ga_user_ids WHERE customer_id IS NOT NULL GROUP BY customer_id

You're right, last minute changes... I've updated the snippet. Thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment