Created
May 23, 2022 20:50
-
-
Save markrittman/088de6b260dbc4bc04290d14bb0e846f to your computer and use it in GitHub Desktop.
Example BigQuery SQL statement to stitch user identities together across multiple Segment and other sources based on inferred connections
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
with | |
meeting_bookings as # meetings have names and email addresses, but no anonymous_id | |
( | |
select | |
# union of three events that are logically the same but were recorded as three different events historically | |
id, | |
timestamp, | |
email as anonymous_id, | |
cast(null as string) as context_ip, | |
cast(null as string) as context_page_path, | |
cast(null as string) as context_page_referrer, | |
cast(null as string) as context_page_title, | |
cast(null as string) as context_user_agent, | |
'booked meeting' as event_text, | |
cast(null as string) as target, | |
event_text as intent, | |
email as email, | |
split(event_booker,' ')[safe_offset(0)] as first_name, | |
split(event_booker,' ')[safe_offset(1)] as last_name, | |
false as is_inferred_identity | |
from | |
`ra-development.zapier_source.calendly_booking` | |
union all | |
select | |
id, | |
timestamp, | |
email as anonymous_id, | |
cast(null as string) as context_ip, | |
cast(null as string) as context_page_path, | |
cast(null as string) as context_page_referrer, | |
cast(null as string) as context_page_title, | |
cast(null as string) as context_user_agent, | |
'booked meeting' as event_text, | |
cast(null as string) as target, | |
coalesce(meeting_purpose, | |
event_text) as intent, | |
email as email, | |
split(full_name,' ')[safe_offset(0)] as first_name, | |
split(full_name,' ')[safe_offset(1)] as last_name, | |
false as is_inferred_identity | |
from | |
`ra-development.zapier_source.meeting_booked` | |
union all | |
select | |
id, | |
timestamp, | |
user_id as anonymous_id, | |
cast(null as string) as context_ip, | |
cast(null as string) as context_page_path, | |
cast(null as string) as context_page_referrer, | |
cast(null as string) as context_page_title, | |
cast(null as string) as context_user_agent, | |
'booked meeting' as event_text, | |
cast(null as string) as target, | |
coalesce(type, | |
event_text) as intent, | |
user_id as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name, | |
false as is_inferred_identity | |
from | |
`ra-development.zapier_source.appointment_booked` ), | |
meeting_requests as -- segment events that were the request for the calendly meeting | |
( | |
select | |
-- by matching these meeting requests to the calendly meetings that they resulted in, we can retrieve the name of the booker | |
r.* except (intent, | |
email, | |
first_name, | |
last_name), | |
b.intent, | |
b.email, | |
b.first_name, | |
b.last_name, | |
true as is_inferred_identity | |
from ( -- the meeting request, sans the booker email and name. again historically there have been three definitions of this event over time | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'requested meeting' as event_text, | |
target, | |
cast(null as string) as intent, | |
cast(null as string) as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name | |
from | |
`ra-development.company_website.pressed_button` | |
where | |
target like '%calendly%' | |
or target like '%hsforms%' | |
union all | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'requested meeting' as event_text, | |
target, | |
cast(null as string) as intent, | |
cast(null as string) as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name | |
from | |
`ra-development.company_website.pressed_a_button` | |
where | |
target like '%calendly%' | |
or target like '%hsforms%' | |
union all | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'requested meeting' as event_text, | |
cast(null as string) as target, | |
cast(null as string) as intent, | |
cast(null as string) as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name | |
from | |
`ra-development.company_website.contact_us_pressed` ) r | |
left join | |
meeting_bookings b | |
on | |
-- this is where we match booking requests to booked meetings, doing it by time (within 60 minutes) | |
timestamp_diff(r.timestamp,b.timestamp,minute) between 0 | |
and 60 ), | |
form_submissions as -- for form submissions in some cases we do get the requestor email address | |
( | |
select | |
* except (__row_number) | |
from ( | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'submitted enquiry' as event_text, | |
cast(null as string) as target, | |
interested_in as intent, | |
email as email, | |
cast(null as string) as first_name, | |
last_name as last_name, | |
false as is_inferred_identity, | |
row_number() over (partition by email, date(timestamp) | |
order by | |
timestamp desc) as __row_number | |
from | |
`ra-development.company_website.contact_form_submitted` ) | |
where | |
__row_number = 1 ), | |
email_clicks as -- for email clicks we do get their email address in the querystring | |
( | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'clicked email' as event_text, | |
cast(null as string) as target, | |
email_campaign as intent, | |
coalesce(email, | |
split(split(context_page_url,'ajs_aid=')[safe_offset(1)],'&')[safe_offset(0)]) as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name, | |
false as is_inferred_identity, | |
from | |
`ra-development.company_website.clicked_email` | |
union all | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'clicked email' as event_text, | |
cast(null as string) as target, | |
email_campaign as intent, | |
coalesce(email, | |
split(split(context_page_url,'ajs_aid=')[safe_offset(1)],'&')[safe_offset(0)]) as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name, | |
false as is_inferred_identity, | |
from | |
`ra-development.company_website.clicked_email_link` ), | |
other_events as -- these are the other touchpoints of interest in the customer journey | |
( | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'viewed page' as event_text, | |
cast(null as string) as target, | |
cast(null as string) as intent, | |
cast(null as string) as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name, | |
false as is_inferred_identity | |
from | |
`ra-development.company_website.pages` | |
union all | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'played podcast' as event_text, | |
cast(null as string) as target, | |
episode as intent, | |
cast(null as string) as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name, | |
false as is_inferred_identity | |
from | |
`ra-development.company_website.podcast_episode_played` | |
union all | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'viewed pricing' as event_text, | |
cast(null as string) as target, | |
cast(null as string) as intent, | |
cast(null as string) as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name, | |
false as is_inferred_identity | |
from | |
`ra-development.company_website.pricing_link_clicked` | |
union all | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'viewed collateral' as event_text, | |
cast(null as string) as target, | |
cast(null as string) as intent, | |
cast(null as string) as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name, | |
false as is_inferred_identity | |
from | |
`ra-development.company_website.collateral_viewed` | |
union all | |
select | |
id, | |
timestamp, | |
anonymous_id, | |
context_ip, | |
context_page_path, | |
context_page_referrer, | |
context_page_title, | |
context_user_agent, | |
'viewed collateral' as event_text, | |
cast(null as string) as target, | |
technologies as intent, | |
cast(null as string) as email, | |
cast(null as string) as first_name, | |
cast(null as string) as last_name, | |
false as is_inferred_identity | |
from | |
`ra-development.company_website.casestudy_clicked` ), | |
ra_events as -- now we union all of these events together | |
( | |
select | |
* | |
from | |
meeting_bookings | |
union all | |
select | |
* | |
from | |
meeting_requests | |
union all | |
select | |
* | |
from | |
form_submissions | |
union all | |
select | |
* | |
from | |
email_clicks | |
union all | |
select | |
* | |
from | |
other_events ), | |
social_media as -- linkedin followers for now, scraped using phantombuster and landed in bq using coupler.io | |
( | |
select | |
cast(row_number() over (order by parse_timestamp('%b %y',l.followedat)) as string) as id, | |
cast(parse_timestamp('%b %y',l.followedat) as timestamp) as timestamp, | |
l.profileurl as anonymous_id, | |
cast(null as string) as context_ip, | |
cast(null as string) as context_page_path, | |
cast(null as string) as context_page_referrer, | |
cast(null as string) as context_page_title, | |
cast(null as string) as context_user_agent, | |
'followed us' as event_text, | |
cast(null as string) as target, | |
cast(null as string) as intent, | |
cast(coalesce(p.email, | |
r.email) as string) as email, | |
coalesce(p.firstname, | |
l.firstname) as first_name, | |
coalesce(p.lastname, | |
l.lastname) as last_name, | |
true as is_inferred_identity | |
from | |
`ra-development.coupler_socialmedia.linkedin_followers` l | |
left join | |
`ra-development.coupler_socialmedia.linkedin_profiles` p | |
on | |
l.profileurl = p.linkedinprofile | |
left join | |
ra_events r -- if we have a user name in linkedin, we try and supplement with email from our own data | |
on | |
l.firstname = r.first_name | |
and l.lastname = r.last_name | |
union all | |
-- add linkedin post likers now, and supplement with email address from our own data if we can match on name | |
select | |
cast(row_number() over (order by l.timestamp) as string) as id, | |
l.timestamp, | |
l.profilelink as anonymous_id, | |
cast(null as string) as context_ip, | |
cast(null as string) as context_page_path, | |
cast(null as string) as context_page_referrer, | |
cast(null as string) as context_page_title, | |
cast(null as string) as context_user_agent, | |
'liked post' as event_text, | |
cast(null as string) as target, | |
cast(null as string) as intent, | |
cast(coalesce(p.email, | |
r.email) as string) as email, | |
coalesce(p.firstname, | |
l.firstname) as first_name, | |
coalesce(p.lastname, | |
l.lastname) as last_name, | |
true as is_inferred_identity | |
from | |
`ra-development.coupler_socialmedia.linkedin_post_likers` l | |
left join | |
`ra-development.coupler_socialmedia.linkedin_profiles` p | |
on | |
l.profilelink = p.linkedinprofile | |
left join | |
ra_events r | |
on | |
l.firstname = r.first_name | |
and l.lastname = r.last_name | |
where | |
l.profilelink is not null ), | |
all_events as ( -- union together the social media data and our own events | |
select | |
* | |
from | |
ra_events | |
union all | |
select | |
* | |
from | |
social_media ), | |
id_stitching as ( | |
select | |
distinct anonymous_id as anonymous_id, | |
last_value(email ignore nulls) over (partition by anonymous_id order by timestamp rows between unbounded preceding and unbounded following ) as user_id, | |
min(timestamp) over (partition by anonymous_id ) as first_seen_at, | |
max(timestamp) over (partition by anonymous_id ) as last_seen_at | |
from | |
all_events ), | |
mapped as ( | |
select | |
coalesce(i.user_id, | |
e.anonymous_id) as blended_user_id, | |
e.event_text, | |
e.* except (event_text) | |
from | |
all_events e | |
left join | |
id_stitching i | |
using | |
(anonymous_id) ), | |
names_filled as ( | |
select | |
* except (first_name, | |
last_name, | |
email), | |
last_value(email ignore nulls) over (partition by blended_user_id order by timestamp rows between unbounded preceding and unbounded following ) as email, | |
last_value(first_name ignore nulls) over (partition by blended_user_id order by timestamp rows between unbounded preceding and unbounded following ) as first_name, | |
last_value(last_name ignore nulls) over (partition by blended_user_id order by timestamp rows between unbounded preceding and unbounded following ) as last_name | |
from | |
mapped ), | |
flattened as ( | |
select | |
blended_user_id, | |
row_number() over (partition by blended_user_id order by timestamp) as event_seq, | |
timestamp, | |
event_text, | |
id, | |
context_ip as ip, | |
context_page_path as page_path, | |
context_page_referrer as referrer, | |
context_page_title as page_title, | |
target as target, | |
intent as intent, | |
context_user_agent as user_agent, | |
email, | |
concat(first_name,' ',last_name) as full_name, | |
anonymous_id, | |
is_inferred_identity | |
from | |
names_filled ), | |
nested as ( | |
select | |
blended_user_id, | |
full_name, | |
email, | |
logical_or(is_inferred_identity) as inferred_identity, | |
array_agg( struct( event_seq, | |
is_inferred_identity, | |
event_text, | |
timestamp, | |
page_path, | |
referrer, | |
page_title, | |
target, | |
intent ) | |
order by | |
event_seq ) touchpoint | |
from | |
flattened | |
group by | |
1, | |
2, | |
3 ) | |
select | |
* | |
from | |
nested | |
where | |
contains_substr(blended_user_id, | |
'@') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment