Skip to content

Instantly share code, notes, and snippets.

@roshanlabh
Forked from OrenBochman/0_BQ_description.md
Created January 29, 2021 05:12
Show Gist options
  • Save roshanlabh/dd6a034c8fb38ac0d29bec2d7cff94a5 to your computer and use it in GitHub Desktop.
Save roshanlabh/dd6a034c8fb38ac0d29bec2d7cff94a5 to your computer and use it in GitHub Desktop.
Big Query for GA analytics

BigQuery for Google Analytics

No metrics in

Self Joins

  • Cross Join example
  • Inner Join example

joins can be inefficent - they can require moving lots of data around and increase the data process from n to n * n

Windowing Functions

-- Count the Number of Sessions Generated by Organic Searches
SELECT COUNT(totals.visits) AS visitCount
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
WHERE trafficSource.medium = "organic"
SELECT COUNT(totals.bounces)/COUNT(totals.visits) AS bounceRate
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/
SELECT HOUR(SEC_TO_TIMESTAMP(visitStartTime)) AS sessionHour,
COUNT(totals.bounces)/COUNT(totals.visits) AS bounceRate
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
GROUP BY sessionHour
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/
-- Per-hour bounce rate calculation
SELECT HOUR(SEC_TO_TIMESTAMP(visitStartTime)) AS sessionHour, -- Extracts the hour from the unix timestamp
-- COUNT is an aggregate function and will only count non-NULL values
COUNT(totals.bounces)/COUNT(totals.visits) AS bounceRate -- Computes the bounce rate. Don't use AVERAGE! it ignores NULLs
-- Let's just use the one example table
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
GROUP BY sessionHour
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/
SELECT trafficSource.medium AS medium,
COUNT(totals.bounces)/COUNT(totals.visits) AS bounceRate
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
GROUP BY medium
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/
-- Mediums with the Most Sessions per Hour
SELECT trafficSource.medium AS medium,
HOUR(SEC_TO_TIMESTAMP(visitStartTime)) AS sessionHour,
COUNT(totals.visits) AS sessionCount,
COUNT(totals.bounces)/COUNT(totals.visits) AS bounceRate
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
GROUP BY medium, sessionHour
ORDER BY sessionHour, sessionCount DESC
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/
SELECT date,
SUM(totals.visits) AS sessions
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
GROUP BY date
--http://www.lunametrics.com/blog/2016/06/23/google-analytics-bigquery-export-schema/
SELECT date,
SUM(totals.visits) AS sessions
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
GROUP BY date
HAVING sessions > 70
-- http://www.lunametrics.com/blog/2016/06/23/google-analytics-bigquery-export-schema/
SELECT date,
SUM(totals.visits) AS sessions
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
WHERE totals.transactions > 0
GROUP BY date
-- http://www.lunametrics.com/blog/2016/06/23/google-analytics-bigquery-export-schema/
SELECT trafficSource.medium,
SUM(totals.newVisits) / SUM(totals.visits) AS percentNewSessions
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
GROUP BY trafficSource.medium
SELECT
CONCAT(trafficSource.source,"/",trafficSource.medium) AS sourceMedium,
newSessions / sessions AS percentNewSessions,
bounces / sessions AS bounceRate
FROM (
SELECT
trafficSource.source,
trafficSource.medium,
SUM(totals.visits) AS sessions,
SUM(totals.newVisits) AS newSessions,
SUM(totals.bounces) AS bounces,
FROM
[google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
GROUP BY
trafficSource.source,
trafficSource.medium)
SELECT
trafficSource.medium,
SUM(totals.visits) AS sessions,
-- 5 will give the min, 25%, 50%, 75%, max with 20% error
-- the more buckets, the better the approximation (error = 1/number of buckets)
-- at the cost of more computation
-- QUANTILES returns all of the buckets, use NTH to extract the bucket you want
NTH(2, QUANTILES(totals.timeOnSite, 5)) AS firstQuartile,
NTH(3, QUANTILES(totals.timeOnSite, 5)) AS mean,
NTH(3, QUANTILES(totals.timeOnSite, 5)) AS thirdQuartile
FROM (
SELECT
trafficSource.medium,
totals.visits,
-- Sessions with a single page view will have no time on site reported
IF(totals.timeOnSite IS NULL, 0, totals.timeOnSite) AS totals.timeOnSite
FROM
[google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910])
GROUP BY
trafficSource.medium
-- of people who viewed one specific page (pageA)
-- which other pages (pageB) did they view and how many times (crossCount) ?
SELECT
sp as pageA,
pp as pageB,
COUNT(*) AS crossCount
FROM (
SELECT
sess.fullvisitorid AS sfvi,
pages.fullVisitorid AS pfvi,
sess.hits.page.pagePath AS sp,
pages.hits.page.pagePath AS pp
FROM
FLATTEN([google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910], hits) AS sess
CROSS JOIN
FLATTEN([google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910], hits) AS pages
WHERE
-- ensures each combo is only counted a single time
pages.fullvisitorid < sess.fullvisitorid
AND pages.hits.page.pagePath < sess.hits.page.pagePath
GROUP BY sfvi, pfvi, sp, pp )
GROUP BY pageA, pageB
ORDER BY pageA, crossCount DESC
-- from - http://www.lunametrics.com/blog/2016/05/12/self-joins-windowing-user-defined-functions-bigquery/
- find the the page viewed before a page,
- we could find all related pages in a session using a self-join,
- filter out, using a WHERE clause because in BigQuery join conditions, in the ON, cannot have inequalities,
SELECT
cur_hit.fullVisitorId AS fullVisitorId,
cur_hit.visitId AS visitID,
cur_hit.hits.hitNumber AS cur_hitnumber,
cur_hit.hits.page.pagePath as cur_pagePath,
cur_hit.hits.time AS cur_time,
MAX(prev_hit.hits.hitNumber) AS prev_hitNumber,
FROM
FLATTEN([google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910], hits) AS prev_hit
INNER JOIN
FLATTEN([google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910], hits) AS cur_hit
ON
prev_hit.fullvisitorid = cur_hit.fullvisitorid
AND prev_hit.visitid = cur_hit.visitid
WHERE
prev_hit.hits.hitNumber < cur_hit.hits.hitNumber
AND prev_hit.hits.type = "PAGE"
AND cur_hit.hits.type = "PAGE"
GROUP BY fullVisitorId, visitID, cur_hitnumber, cur_pagePath, cur_time
-- http://www.lunametrics.com/blog/2016/05/12/self-joins-windowing-user-defined-functions-bigquery/
  • Self-Joins are when you join a table to itself,
  • use to find related rows in a table
  • provides flexibility,
  • joins in BigQuery are inefficient (the larger the “smaller” table becomes, the more data needs to be shipped between nodes)
  • a join may require "multipliying" two tables - in big query there is also an issue of moving the data between nodes)
-- using windowing to find time on page
-- COUNT(num) OVER (PARTITION BY num ) as Var I
-- different windowing functions can be used in the same query
SELECT
fullVisitorId,
visitId,
hits.hitNumber,
hits.page.pagePath,
-- get next and previous hit time to be able to work out length of each hit
hits.time AS hit_time,
LEAD(hits.time, 1) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.time ASC) AS next_hit_time,
FROM
[google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
WHERE
hits.type = "PAGE"
SELECT
fullVisitorId,
visitId,
hitNumber,
pagePath,
diff AS timeOnPage_ms
FROM
timePerPage(
SELECT
fullVisitorId,
visitId,
NEST(hits.page.pagePath) AS pages,
NEST(hits.type) AS types,
NEST(hits.time) AS times
FROM
[google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]
GROUP BY
fullVisitorId,
visitId
LIMIT
10)
SELECT
COUNT(*) as logins_after_product_view
FROM (
SELECT
SUM(IF(hits.eventinfo.eventCategory = "Login", 1, 0)) OVER (PARTITION BY fullvisitorid, visitid ROWS BETWEEN 1 FOLLOWING AND 3 FOLLOWING) AS login_count,
hits.page.pagePath as pagePath
FROM
[google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] )
WHERE
login_count > 0
AND (pagePath CONTAINS 'vests' OR pagePath CONTAINS 'helments')
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment