Skip to content

Instantly share code, notes, and snippets.

@khunreus
Created July 28, 2022 05:53
Show Gist options
  • Save khunreus/eeeaa966c7983abfe62760de77cf7488 to your computer and use it in GitHub Desktop.
Save khunreus/eeeaa966c7983abfe62760de77cf7488 to your computer and use it in GitHub Desktop.
The query pulls page_view event data from the Google BigQuery public dataset for ecommerce (GA4). The implementation goes around using traditional UNNEST approach as it can cause missing rows when a CROSS JOIN is applied to a table with NULL values.
-- the query pulls page_view event data from the Google BigQuery public dataset for ecommerce (GA4)
-- the implementation with array aggregation is addressing an issue raised here https://stackoverflow.com/questions/44918108/google-bigquery-i-lost-null-row-when-using-unnest-function
-- stackoverflow thread is exploring a different solution
WITH base_table AS (
-- pulls relevant columns from relevant dates to decrease the size of data scanned
SELECT
event_name,
event_date,
event_timestamp,
user_pseudo_id,
user_id,
device,
geo,
traffic_source,
event_params,
user_properties
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE
_table_suffix >= '20210101'
AND event_name IN (
'page_view'
)
)
, unnested_events AS (
-- unnests event parameters to get to relevant keys and values
SELECT
event_date AS date,
event_timestamp AS event_timestamp_microseconds,
user_pseudo_id,
ARRAY_AGG(DISTINCT ARRAY(
SELECT DISTINCT
c.value.int_value AS ga_session_id
FROM
UNNEST(event_params) c
WHERE c.key = 'ga_session_id'
AND c.value.int_value is NOT NULL
GROUP BY 1)[SAFE_OFFSET(0)]
IGNORE NULLS)[SAFE_OFFSET(0)] visitID,
ARRAY_AGG(DISTINCT ARRAY(
SELECT DISTINCT
c.value.int_value AS ga_session_number
FROM
UNNEST(event_params) c
WHERE c.key = 'ga_session_number'
AND c.value.int_value is NOT NULL
GROUP BY 1)[SAFE_OFFSET(0)]
IGNORE NULLS)[SAFE_OFFSET(0)] visitNumber
,ARRAY_AGG(DISTINCT ARRAY(
SELECT DISTINCT
c.value.string_value AS page_title
FROM
UNNEST(event_params) c
WHERE c.key = 'page_title'
GROUP BY 1)[SAFE_OFFSET(0)]
IGNORE NULLS)[SAFE_OFFSET(0)] page_title
FROM
base_table
GROUP BY 1,2,3
)
, ranked_screens AS (
-- prepares additional data points for analytics to understand transitions between the previous, current and following pages
SELECT
*,
DENSE_RANK() OVER (PARTITION BY user_pseudo_id, visitID ORDER BY event_timestamp_microseconds ASC) page_rank,
LAG(page_title,1) OVER (PARTITION BY user_pseudo_id, visitID ORDER BY event_timestamp_microseconds ASC) previous_page,
LEAD(page_title,1) OVER (PARTITION BY user_pseudo_id, visitID ORDER BY event_timestamp_microseconds ASC) next_page
FROM
unnested_events
--WHERE
-- length(trim(page_title)) > 0
)
, screen_summary AS (
-- another layer of analytics: check the last page number viewed on a session
-- aggregate all screens per session, which will be helpful in identifying power users
SELECT
*,
MAX(page_rank) OVER (PARTITION BY user_pseudo_id, visitID) last_page_rank,
ARRAY_AGG(page_title) OVER (PARTITION BY user_pseudo_id, visitID) pages_on_a_visit
FROM
ranked_screens
)
, screen_summary_agg AS (
-- cleans up pages_on_a_visit field
SELECT * EXCEPT(
pages_on_a_visit
),
ARRAY_TO_STRING(
ARRAY(
SELECT DISTINCT
*
FROM
UNNEST(pages_on_a_visit)
ORDER BY 1 ASC
)
, '||') AS screens_on_a_visit
FROM
screen_summary
)
SELECT
-- final analytical point: the first time a page is viewed in a session
*,
MIN(page_rank) OVER (PARTITION BY user_pseudo_id, visitID, page_title) AS page_rank_session_first_seen
FROM
screen_summary_agg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment