Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save krisjan-oldekamp/b3b98ecc579fa40a8fdc48a84256a2de to your computer and use it in GitHub Desktop.
Save krisjan-oldekamp/b3b98ecc579fa40a8fdc48a84256a2de to your computer and use it in GitHub Desktop.
-- Author: Krisjan Oldekamp
-- https://stacktonic.com/article/enrich-a-single-customer-view-with-google-analytics-4-big-query-data
declare lookback_window int64 default 365; -- how many days to lookback into the ga4 dataset to calculate profiles
-- udf: channel grouping (you could put this in a permanent function)
-- also see https://stacktonic.com/article/google-analytics-4-and-big-query-create-custom-channel-groupings-in-a-reusable-sql-function
create temporary function channel_grouping(tsource string, medium string, campaign string) as (
case
when (tsource = '(direct)' or tsource is null)
and (regexp_contains(medium, r'^(\(not set\)|\(none\))$') or medium is null)
then 'direct'
when regexp_contains(medium, r'^(social|social_advertising|social-advertising|social_network|social-network|social_media|social-media)$')
then 'social'
when regexp_contains(medium, r'^(email|mail)$')
then 'email'
when regexp_contains(medium, r'^(affiliate|affiliates)$')
then 'affiliate'
when regexp_contains(medium, r'^(cpc|ppc|paidsearch)$')
then 'search_paid'
when regexp_contains(medium, r'^(display|cpm|banner)$')
then 'display'
when medium = 'organic'
then 'search_organic'
when medium = 'referral'
then 'referral'
else '(other)'
end
);
-- udf: most frequent value from array
create temp function most_freq(arr any type) as ((
select x from unnest(arr) x group by x order by count(1) desc limit 1
));
-- udf: get first channel within session
create temp function first_channel_session(arr any type) as ((
select
channel_grouping(x.source, x.medium, x.campaign)
from unnest(arr) x
where x.ignore_referrer is null
order by x.event_timestamp asc limit 1
));
with
-- this in-between table is based mapping table described in this article (you can leave out this part if you want to group on device-id only)
-- https://stacktonic.com/article/create-a-user-mapping-table-based-on-the-google-analytics-4-big-query-dataset)
-- only select ga_client_id (`user_pseudo_id`) with 3 of less customer-ids attached.
-- when multiple customer-ids, select most recent one.
customer_id_mapping as (
select
ga_client_id,
customer_ids,
(select x.customer_id from unnest(customer_ids) x order by x.timestamp desc limit 1) as customer_id
from (
select
ga_id.id as ga_client_id,
count(customer_id) as customer_ids_count,
array_agg(struct(customer_id, ga_id.timestamp)) as customer_ids,
from
`<your-project>.<your-dataset>.<your-mapping-table>`,
unnest(ga_client_id) as ga_id
group by
ga_client_id
)
where
customer_ids_count <= 3
),
-- aggegrate event level data to session level data and attach customer-id from mapping table
sessions as (
select
user_pseudo_id as ga_client_id,
max(m.customer_id) as customer_id, -- logic to select only one customer-id per user_pseudo_id can be found in `customer_id_mapping`
concat(user_pseudo_id,'.',(select cast(value.int_value as string) from unnest(event_params) where key = 'ga_session_id')) as session_id, -- combine user_pseudo_id and session_id for a unique session-id
timestamp_micros(min(event_timestamp)) as session_start,
max((select value.string_value from unnest(event_params) where key = 'session_engaged')) as session_engaged,
max((select value.int_value from unnest(event_params) where key = 'engagement_time_msec'))/1000 as engagement_time,
max(lower(traffic_source.source)) as traffic_source_first,
max(lower(traffic_source.medium)) as traffic_medium_first,
min(timestamp_micros(user_first_touch_timestamp)) as session_start_first_touch,
max(device.category) as device,
max(device.operating_system) as os,
max(device.web_info.browser) as browser,
max(geo.country) as country,
max(geo.city) as city,
ifnull(first_channel_session(
array_agg(
if(event_name in('page_view','user_engagement','scroll'), struct(
event_timestamp,
lower((select value.string_value from unnest(event_params) where key = 'source')) as source,
lower((select value.string_value from unnest(event_params) where key = 'medium')) as medium,
lower((select value.string_value from unnest(event_params) where key = 'campaign')) as campaign,
(select value.int_value from unnest(event_params) where key = 'entrances') as is_entrance,
(select value.int_value from unnest(event_params) where key = 'ignore_referrer') as ignore_referrer
), null)
ignore nulls order by event_timestamp asc)
), 'direct') as first_channel_in_session,
--countif(event_name = '<name-of-some-other-conversion-event>') as some_conversion,
if(countif(event_name = 'login') > 0,1,0) as session_login,
countif(event_name = 'purchase') as transactions,
sum(ecommerce.purchase_revenue) as transaction_value,
array_agg(
if(event_name = 'view_item' or event_name = 'add_to_cart', struct(
event_timestamp,
event_name,
(select item_id from unnest(items) limit 1) as item_id,
(select item_category from unnest(items) limit 1) as item_category
), null)
ignore nulls order by event_timestamp desc limit 100) as item_interactions
from
`<your-project>.analytics_<your-ga4-property-id>.events_*` as s
left join customer_id_mapping as m
on m.ga_client_id = s.user_pseudo_id
where
_table_suffix between format_date('%Y%m%d',date_sub(current_date(), interval lookback_window day))
and format_date('%Y%m%d',date_sub(current_date(), interval 1 day))
group by
user_pseudo_id,
session_id
)
-- aggegrate to user level
select
* except (item_interactions),
array_to_string(
array(select x.item_id from unnest(item_interactions) x where x.event_name = 'view_item' group by x.item_id order by max(x.event_timestamp) desc limit 5),
'|'
) as last_viewed_items,
array_to_string(
array(select x.item_id from unnest(item_interactions) x where x.event_name = 'view_item' group by x.item_id order by count(1) desc limit 5),
'|'
) as most_viewed_items,
array_to_string(
array(select x.item_category from unnest(item_interactions) x where x.event_name = 'view_item' group by x.item_category order by count(1) desc limit 5),
'|'
) as most_viewed_categories
from (
select
--ga_client_id, -- uncomment when grouping on device-id instead of customer-id
customer_id, -- remove when grouping on device-id
min(datetime(session_start_first_touch)) as first_touch,
min(datetime(session_start)) as first_session,
max(datetime(session_start)) as last_session,
max(datetime(if(transactions > 0, session_start, null))) as last_transaction,
count(session_id) as sessions,
countif(date_diff(current_date, datetime(session_start), day) between 0 and 7) as sessions_1d_7d,
countif(date_diff(current_date, datetime(session_start), day) between 8 and 30) as sessions_7d_30d,
countif(date_diff(current_date, datetime(session_start), day) between 31 and 90) as sessions_30d_90d,
countif(session_engaged = '1') as sessions_engaged,
safe_divide(sum(engagement_time), countif(session_engaged = '1')) as engagement_time_avg,
sum(session_login) as logins,
sum(transactions) as total_transactions,
ifnull(sum(transaction_value),0) as total_transaction_value,
count(distinct device) as devices,
count(distinct first_channel_in_session) as channels,
date_diff(current_date(),cast((max(session_start)) as date),day) as days_since_last_session,
date_diff(current_date(),cast((max(if(transactions > 0, session_start, null))) as date),day) as days_since_last_transaction,
string_agg(device order by session_start asc limit 1) as device_first,
string_agg(device order by session_start desc limit 1) as device_last,
most_freq(array_agg(device)) as device_most_freq,
string_agg(os order by session_start desc limit 1) as os_last,
most_freq(array_agg(os)) as os_most_freq,
string_agg(browser order by session_start desc limit 1) as browser_last,
most_freq(array_agg(browser)) as browser_most_freq,
channel_grouping(min(traffic_source_first), min(traffic_medium_first), null) as channel_acquisition,
string_agg(first_channel_in_session order by session_start asc limit 1) as channel_first,
string_agg(first_channel_in_session order by session_start desc limit 1) as channel_last,
most_freq(array_agg(first_channel_in_session ignore nulls)) as channel_most_freq,
array_concat_agg(item_interactions limit 200) as item_interactions
from
sessions
where -- remove when grouping on device-id
customer_id is not null -- remove when grouping on device-id
group by
customer_id -- remove when grouping on device-id
--ga_client_id --uncomment when grouping on device-id instead of customer-id
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment