Skip to content

Instantly share code, notes, and snippets.

@michalbrys
Last active December 6, 2023 14:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michalbrys/ed4bc7060189f29241e92bdc40d32119 to your computer and use it in GitHub Desktop.
Save michalbrys/ed4bc7060189f29241e92bdc40d32119 to your computer and use it in GitHub Desktop.
BQML #1 - Train your first model
--count unique users number in January
SELECT
COUNT(DISTINCT user_pseudo_id) AS users
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*`
WHERE
_TABLE_SUFFIX BETWEEN '20210101' AND '20210131'
-- PLEASE REPLACE WITH YOUR DESIRED DATE RANGE
-- AND _TABLE_SUFFIX BETWEEN '20180501' AND '20240131'`
CREATE TABLE bqmlhackathon.ga4.ga4_sample_ecommerce AS
-- sessions enhanced by users data
SELECT s.fullVisitorId, s.visitStartTime, s.date, s.deviceCategory, s.isMobile, s.operatingSystem, s.browser,
s.country, s.city, s.trafficSource, s.trafficMedium, s.trafficCampaign, s.isFirstVisit, s.productPagesViewed, s.addedToCart,
u.totalHits, u.totalPageviews, u_visits.totalVisits, u_engagement.totalTimeOnSite
FROM (
-- sessions
SELECT fullVisitorId, ga_session_id,
MAX(visitStartTime) AS visitStartTime,
date, deviceCategory, isMobile, operatingSystem, browser,
country, city, trafficSource, trafficMedium, trafficCampaign,
MAX(isFirstVisit) AS isFirstVisit,
MAX(productPagesViewed) AS productPagesViewed,
MAX(addedToCart) AS addedToCart
FROM (
-- raw events
SELECT
user_pseudo_id AS fullVisitorId,
ep.value.int_value AS ga_session_id,
CASE event_name WHEN 'session_start' THEN event_timestamp ELSE NULL END AS visitStartTime,
event_date AS date,
device.category AS deviceCategory,
CASE device.category WHEN 'mobile' THEN 1 ELSE 0 END AS isMobile,
device.operating_system AS operatingSystem,
device.web_info.browser AS browser,
geo.country AS country,
geo.city AS city,
traffic_source.source AS trafficSource,
traffic_source.medium AS trafficMedium,
CASE traffic_source.medium WHEN 'cpc' THEN traffic_source.name ELSE NULL END AS trafficCampaign,
CASE event_name WHEN 'first_visit' THEN 1 ELSE 0 END AS isFirstVisit,
CASE event_name WHEN 'view_item' THEN 1 ELSE 0 END AS productPagesViewed,
CASE event_name WHEN 'add_to_cart' THEN 1 ELSE 0 END AS addedToCart
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_202101*` e
CROSS JOIN
UNNEST(e.event_params) ep
WHERE ep.key = 'ga_session_id'
) events
GROUP BY fullVisitorId, ga_session_id, date, deviceCategory, isMobile,
operatingSystem, browser, country, city, trafficSource, trafficMedium, trafficCampaign
) s
LEFT JOIN (
-- totalHits, totalPageviews
SELECT user_pseudo_id AS fullVisitorId,
COUNT(user_pseudo_id) AS totalHits,
COUNTIF(event_name = 'page_view') AS totalPageviews,
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_202101*` e
GROUP BY user_pseudo_id
) u USING(fullVisitorId)
LEFT JOIN (
-- totalVisits
-- count unique `ga_session_id`
SELECT user_pseudo_id AS fullVisitorId,
COUNT(DISTINCT ep.value.int_value) AS totalVisits
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_202101*` e
CROSS JOIN
UNNEST(e.event_params) ep
WHERE ep.key = 'ga_session_id'
GROUP BY user_pseudo_id
) u_visits USING(fullVisitorId)
LEFT JOIN (
-- totalTimeOnSite (divide by 1000 to get seconds)
SELECT user_pseudo_id AS fullVisitorId,
MAX(engagement_time_msec) AS totalTimeOnSite
FROM (
SELECT
user_pseudo_id,
value.int_value AS engagement_time_msec
FROM
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_202101*` e
CROSS JOIN
UNNEST(e.event_params) ep WHERE key = 'engagement_time_msec'
) s
GROUP BY user_pseudo_id
) u_engagement USING(fullVisitorId)
CREATE OR REPLACE MODEL
bqmlhackathon.ga4.init_model_logistic_reg
TRANSFORM(
* EXCEPT(
fullVisitorId, --don't train on user_id
isFirstVisit, --we'll change the feature type below
date --don't train on date
)
,CAST(isFirstVisit AS bool) AS isFirstVisit
--change feature type to bool
)
OPTIONS(MODEL_TYPE = 'LOGISTIC_REG',
INPUT_LABEL_COLS = ['addedToCart']
)
AS
SELECT
*
FROM
`bqmlhackathon.ga4.ga4_sample_ecommerce`
#standardSQL
SELECT *
FROM ML.WEIGHTS(MODEL `bqmlhackathon.ga4.init_model_logistic_reg`)
SELECT
sessionId,
prob as addedToCart
FROM
ML.PREDICT(MODEL `bqmlhackathon.ga4.init_model_logistic_reg`,
(
SELECT *,CONCAT(fullVisitorId, CAST(visitStartTime as string)) as sessionId
FROM `bqmlhackathon.ga4.ga4_sample_ecommerce_20201231`)
),
UNNEST(predicted_addedToCart_probs)
WHERE label = 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment