Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save markrittman/088de6b260dbc4bc04290d14bb0e846f to your computer and use it in GitHub Desktop.
Save markrittman/088de6b260dbc4bc04290d14bb0e846f to your computer and use it in GitHub Desktop.
Example BigQuery SQL statement to stitch user identities together across multiple Segment and other sources based on inferred connections
with
meeting_bookings as # meetings have names and email addresses, but no anonymous_id
(
select
# union of three events that are logically the same but were recorded as three different events historically
id,
timestamp,
email as anonymous_id,
cast(null as string) as context_ip,
cast(null as string) as context_page_path,
cast(null as string) as context_page_referrer,
cast(null as string) as context_page_title,
cast(null as string) as context_user_agent,
'booked meeting' as event_text,
cast(null as string) as target,
event_text as intent,
email as email,
split(event_booker,' ')[safe_offset(0)] as first_name,
split(event_booker,' ')[safe_offset(1)] as last_name,
false as is_inferred_identity
from
`ra-development.zapier_source.calendly_booking`
union all
select
id,
timestamp,
email as anonymous_id,
cast(null as string) as context_ip,
cast(null as string) as context_page_path,
cast(null as string) as context_page_referrer,
cast(null as string) as context_page_title,
cast(null as string) as context_user_agent,
'booked meeting' as event_text,
cast(null as string) as target,
coalesce(meeting_purpose,
event_text) as intent,
email as email,
split(full_name,' ')[safe_offset(0)] as first_name,
split(full_name,' ')[safe_offset(1)] as last_name,
false as is_inferred_identity
from
`ra-development.zapier_source.meeting_booked`
union all
select
id,
timestamp,
user_id as anonymous_id,
cast(null as string) as context_ip,
cast(null as string) as context_page_path,
cast(null as string) as context_page_referrer,
cast(null as string) as context_page_title,
cast(null as string) as context_user_agent,
'booked meeting' as event_text,
cast(null as string) as target,
coalesce(type,
event_text) as intent,
user_id as email,
cast(null as string) as first_name,
cast(null as string) as last_name,
false as is_inferred_identity
from
`ra-development.zapier_source.appointment_booked` ),
meeting_requests as -- segment events that were the request for the calendly meeting
(
select
-- by matching these meeting requests to the calendly meetings that they resulted in, we can retrieve the name of the booker
r.* except (intent,
email,
first_name,
last_name),
b.intent,
b.email,
b.first_name,
b.last_name,
true as is_inferred_identity
from ( -- the meeting request, sans the booker email and name. again historically there have been three definitions of this event over time
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'requested meeting' as event_text,
target,
cast(null as string) as intent,
cast(null as string) as email,
cast(null as string) as first_name,
cast(null as string) as last_name
from
`ra-development.company_website.pressed_button`
where
target like '%calendly%'
or target like '%hsforms%'
union all
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'requested meeting' as event_text,
target,
cast(null as string) as intent,
cast(null as string) as email,
cast(null as string) as first_name,
cast(null as string) as last_name
from
`ra-development.company_website.pressed_a_button`
where
target like '%calendly%'
or target like '%hsforms%'
union all
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'requested meeting' as event_text,
cast(null as string) as target,
cast(null as string) as intent,
cast(null as string) as email,
cast(null as string) as first_name,
cast(null as string) as last_name
from
`ra-development.company_website.contact_us_pressed` ) r
left join
meeting_bookings b
on
-- this is where we match booking requests to booked meetings, doing it by time (within 60 minutes)
timestamp_diff(r.timestamp,b.timestamp,minute) between 0
and 60 ),
form_submissions as -- for form submissions in some cases we do get the requestor email address
(
select
* except (__row_number)
from (
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'submitted enquiry' as event_text,
cast(null as string) as target,
interested_in as intent,
email as email,
cast(null as string) as first_name,
last_name as last_name,
false as is_inferred_identity,
row_number() over (partition by email, date(timestamp)
order by
timestamp desc) as __row_number
from
`ra-development.company_website.contact_form_submitted` )
where
__row_number = 1 ),
email_clicks as -- for email clicks we do get their email address in the querystring
(
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'clicked email' as event_text,
cast(null as string) as target,
email_campaign as intent,
coalesce(email,
split(split(context_page_url,'ajs_aid=')[safe_offset(1)],'&')[safe_offset(0)]) as email,
cast(null as string) as first_name,
cast(null as string) as last_name,
false as is_inferred_identity,
from
`ra-development.company_website.clicked_email`
union all
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'clicked email' as event_text,
cast(null as string) as target,
email_campaign as intent,
coalesce(email,
split(split(context_page_url,'ajs_aid=')[safe_offset(1)],'&')[safe_offset(0)]) as email,
cast(null as string) as first_name,
cast(null as string) as last_name,
false as is_inferred_identity,
from
`ra-development.company_website.clicked_email_link` ),
other_events as -- these are the other touchpoints of interest in the customer journey
(
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'viewed page' as event_text,
cast(null as string) as target,
cast(null as string) as intent,
cast(null as string) as email,
cast(null as string) as first_name,
cast(null as string) as last_name,
false as is_inferred_identity
from
`ra-development.company_website.pages`
union all
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'played podcast' as event_text,
cast(null as string) as target,
episode as intent,
cast(null as string) as email,
cast(null as string) as first_name,
cast(null as string) as last_name,
false as is_inferred_identity
from
`ra-development.company_website.podcast_episode_played`
union all
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'viewed pricing' as event_text,
cast(null as string) as target,
cast(null as string) as intent,
cast(null as string) as email,
cast(null as string) as first_name,
cast(null as string) as last_name,
false as is_inferred_identity
from
`ra-development.company_website.pricing_link_clicked`
union all
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'viewed collateral' as event_text,
cast(null as string) as target,
cast(null as string) as intent,
cast(null as string) as email,
cast(null as string) as first_name,
cast(null as string) as last_name,
false as is_inferred_identity
from
`ra-development.company_website.collateral_viewed`
union all
select
id,
timestamp,
anonymous_id,
context_ip,
context_page_path,
context_page_referrer,
context_page_title,
context_user_agent,
'viewed collateral' as event_text,
cast(null as string) as target,
technologies as intent,
cast(null as string) as email,
cast(null as string) as first_name,
cast(null as string) as last_name,
false as is_inferred_identity
from
`ra-development.company_website.casestudy_clicked` ),
ra_events as -- now we union all of these events together
(
select
*
from
meeting_bookings
union all
select
*
from
meeting_requests
union all
select
*
from
form_submissions
union all
select
*
from
email_clicks
union all
select
*
from
other_events ),
social_media as -- linkedin followers for now, scraped using phantombuster and landed in bq using coupler.io
(
select
cast(row_number() over (order by parse_timestamp('%b %y',l.followedat)) as string) as id,
cast(parse_timestamp('%b %y',l.followedat) as timestamp) as timestamp,
l.profileurl as anonymous_id,
cast(null as string) as context_ip,
cast(null as string) as context_page_path,
cast(null as string) as context_page_referrer,
cast(null as string) as context_page_title,
cast(null as string) as context_user_agent,
'followed us' as event_text,
cast(null as string) as target,
cast(null as string) as intent,
cast(coalesce(p.email,
r.email) as string) as email,
coalesce(p.firstname,
l.firstname) as first_name,
coalesce(p.lastname,
l.lastname) as last_name,
true as is_inferred_identity
from
`ra-development.coupler_socialmedia.linkedin_followers` l
left join
`ra-development.coupler_socialmedia.linkedin_profiles` p
on
l.profileurl = p.linkedinprofile
left join
ra_events r -- if we have a user name in linkedin, we try and supplement with email from our own data
on
l.firstname = r.first_name
and l.lastname = r.last_name
union all
-- add linkedin post likers now, and supplement with email address from our own data if we can match on name
select
cast(row_number() over (order by l.timestamp) as string) as id,
l.timestamp,
l.profilelink as anonymous_id,
cast(null as string) as context_ip,
cast(null as string) as context_page_path,
cast(null as string) as context_page_referrer,
cast(null as string) as context_page_title,
cast(null as string) as context_user_agent,
'liked post' as event_text,
cast(null as string) as target,
cast(null as string) as intent,
cast(coalesce(p.email,
r.email) as string) as email,
coalesce(p.firstname,
l.firstname) as first_name,
coalesce(p.lastname,
l.lastname) as last_name,
true as is_inferred_identity
from
`ra-development.coupler_socialmedia.linkedin_post_likers` l
left join
`ra-development.coupler_socialmedia.linkedin_profiles` p
on
l.profilelink = p.linkedinprofile
left join
ra_events r
on
l.firstname = r.first_name
and l.lastname = r.last_name
where
l.profilelink is not null ),
all_events as ( -- union together the social media data and our own events
select
*
from
ra_events
union all
select
*
from
social_media ),
id_stitching as (
select
distinct anonymous_id as anonymous_id,
last_value(email ignore nulls) over (partition by anonymous_id order by timestamp rows between unbounded preceding and unbounded following ) as user_id,
min(timestamp) over (partition by anonymous_id ) as first_seen_at,
max(timestamp) over (partition by anonymous_id ) as last_seen_at
from
all_events ),
mapped as (
select
coalesce(i.user_id,
e.anonymous_id) as blended_user_id,
e.event_text,
e.* except (event_text)
from
all_events e
left join
id_stitching i
using
(anonymous_id) ),
names_filled as (
select
* except (first_name,
last_name,
email),
last_value(email ignore nulls) over (partition by blended_user_id order by timestamp rows between unbounded preceding and unbounded following ) as email,
last_value(first_name ignore nulls) over (partition by blended_user_id order by timestamp rows between unbounded preceding and unbounded following ) as first_name,
last_value(last_name ignore nulls) over (partition by blended_user_id order by timestamp rows between unbounded preceding and unbounded following ) as last_name
from
mapped ),
flattened as (
select
blended_user_id,
row_number() over (partition by blended_user_id order by timestamp) as event_seq,
timestamp,
event_text,
id,
context_ip as ip,
context_page_path as page_path,
context_page_referrer as referrer,
context_page_title as page_title,
target as target,
intent as intent,
context_user_agent as user_agent,
email,
concat(first_name,' ',last_name) as full_name,
anonymous_id,
is_inferred_identity
from
names_filled ),
nested as (
select
blended_user_id,
full_name,
email,
logical_or(is_inferred_identity) as inferred_identity,
array_agg( struct( event_seq,
is_inferred_identity,
event_text,
timestamp,
page_path,
referrer,
page_title,
target,
intent )
order by
event_seq ) touchpoint
from
flattened
group by
1,
2,
3 )
select
*
from
nested
where
contains_substr(blended_user_id,
'@')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment