Skip to content

Instantly share code, notes, and snippets.

@rachelannelise
Created July 21, 2017 01:58
Show Gist options
  • Save rachelannelise/922f153d2c47ffb7d5c68b1e1c4cde08 to your computer and use it in GitHub Desktop.
Save rachelannelise/922f153d2c47ffb7d5c68b1e1c4cde08 to your computer and use it in GitHub Desktop.
DPL Star Schema Queries
--CONTENT_DIM:
--Grain: 1 row per apikey, post
select distinct
--- ids
row_number() OVER (ORDER BY
apikey,
metadata_post_id) as content_dim_id,
--Redshift query creations
create view content_dim as
select distinct
--- ids
apikey,
metadata_post_id,
--- derived dimensions
case
when metadata_full_content_word_count >= 4000 then '4,000 or Above'
when metadata_full_content_word_count >= 3000 then '3,000 - 3,999'
when metadata_full_content_word_count >= 2000 then '2,000 - 2,999'
when metadata_full_content_word_count >= 1000 then '1,000 - 1,999'
when metadata_full_content_word_count >= 500 then '500 - 999'
when metadata_full_content_word_count >= 100 then '100 - 499'
else '< 100' end as word_count_buckets,
--- dimensions
metadata_authors,
metadata_full_content_word_count,
metadata_page_type,
metadata_pub_date_tmsp,
metadata_canonical_url,
metadata_section,
metadata_tags,
metadata_title
from public.demo_mode_dpl
where metadata_post_id is not null;
--SESSION_CONTENT_FACT:
--Grain: 1 row per apikey, session, user, post, page type
--Metrics: engaged_time, pageviews
select
--- ids
row_number() OVER (ORDER BY DPL.apikey,
DPL.session_id,
DPL.campaign_id,
DPL.visitor_ip,
DPL.visitor_site_id,
DPL.metadata_post_id,
DPL.metadata_page_type) session_content_fact_id,
DPL.apikey,
DPL.session_id,
DPL.campaign_id,
DPL.visitor_ip,
DPL.visitor_site_id,
DPL.metadata_post_id,
DPL.metadata_page_type,
--- metrics
sum(DPL.engaged_time_inc) as engaged_time,
PV.pageviews
from parsely_dpl DPL
join
(select
apikey,
session_id,
campaign_id,
visitor_ip,
visitor_site_id,
metadata_page_type,
metadata_post_id,
count(session_id) as pageviews
from parsely_dpl
where action='pageview'
group by apikey, metadata_post_id, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type)PV
on PV.apikey = DPL.apikey
and PV.session_id = DPL.session_id
and PV.visitor_ip = DPL.visitor_ip
and PV.visitor_site_id = DPL.visitor_site_id
and PV.metadata_page_type = DPL.metadata_page_type
and PV.metadata_post_id = DPL.metadata_post_id
group by DPL.apikey, DPL.session_id, DPL.campaign_id, DPL.visitor_ip, DPL.visitor_site_id, DPL.metadata_post_id, DPL.metadata_page_type,PV.pageviews;
--redshift view creation
create view session_content_fact as
select
--- ids
DPL.apikey,
DPL.session_id,
DPL.campaign_id,
DPL.visitor_ip,
DPL.visitor_site_id,
DPL.metadata_post_id,
DPL.metadata_page_type,
--- metrics
sum(DPL.engaged_time_inc) as engaged_time,
PV.pageviews
from public.demo_mode_dpl DPL
join
(select
apikey,
session_id,
campaign_id,
visitor_ip,
visitor_site_id,
metadata_page_type,
metadata_post_id,
count(session_id) as pageviews
from public.demo_mode_dpl
where action='pageview'
group by apikey, metadata_post_id, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type)PV
on PV.apikey = DPL.apikey
and PV.session_id = DPL.session_id
and PV.visitor_ip = DPL.visitor_ip
and PV.visitor_site_id = DPL.visitor_site_id
and PV.metadata_page_type = DPL.metadata_page_type
and PV.metadata_post_id = DPL.metadata_post_id
group by DPL.apikey, DPL.session_id, DPL.campaign_id, DPL.visitor_ip, DPL.visitor_site_id, DPL.metadata_post_id, DPL.metadata_page_type,PV.pageviews;
--SESSION_DETAIL_DIM:
--Grain: 1 row per apikey, session, user, post (both pageviews and heartbeats)
select distinct
--- ids
row_number() OVER (ORDER BY
apikey,
session_id,
campaign_id,
event_id,
visitor_ip,
visitor_site_id,
metadata_post_id,
action) as session_detail_dim_id,
apikey,
session_id,
campaign_id,
event_id,
visitor_ip,
visitor_site_id,
metadata_post_id,
--- dimensions
action,
timestamp_info,
timestamp_info_nginx_ms,
timestamp_info_override_ms,
timestamp_info_pixel_ms,
ts_action,
ts_session_current,
ts_session_last
from parsely_dpl
where metadata_page_type = 'post'
order by visitor_ip, session_id asc;
--redshift view creation
create view session_detail_dim as
select distinct
--- ids
apikey,
session_id,
campaign_id,
event_id,
visitor_ip,
visitor_site_id,
metadata_post_id,
--- dimensions
action,
timestamp_info,
timestamp_info_nginx_ms,
timestamp_info_override_ms,
timestamp_info_pixel_ms,
ts_action,
ts_session_current,
ts_session_last
from public.demo_mode_dpl
where metadata_page_type = 'post';
--SESSION_DIM:
--Grain: 1 row per apikey, session, user (pageviews only)
--Redshift view create\
create view session_dim as
select distinct
--- ids
DPL.apikey,
DPL.session_id,
campaign_id,
DPL.event_id,
DPL.visitor_ip,
DPL.visitor_site_id,
DPL.metadata_post_id,
--- derived dimensions
case
when engaged_time > 30 then 'Deep Read'
when engaged_time > 10 then 'Read'
else 'Skim' end as read_category,
datediff(hour,
(TIMESTAMP 'epoch' + left(metadata_pub_date_tmsp,10)::bigint * INTERVAL '1 Second '),
(TIMESTAMP 'epoch' + left(timestamp_info_nginx_ms,10)::bigint * INTERVAL '1 Second ')) as hours_since_published,
datediff(day,
(TIMESTAMP 'epoch' + left(metadata_pub_date_tmsp,10)::bigint * INTERVAL '1 Second '),
(TIMESTAMP 'epoch' + left(timestamp_info_nginx_ms,10)::bigint * INTERVAL '1 Second ')) as days_since_published,
datediff(week,
(TIMESTAMP 'epoch' + left(metadata_pub_date_tmsp,10)::bigint * INTERVAL '1 Second '),
(TIMESTAMP 'epoch' + left(timestamp_info_nginx_ms,10)::bigint * INTERVAL '1 Second ')) as weeks_since_published,
--- dimensions
action,
ip_city,
ip_continent,
ip_country,
ip_lat,
ip_lon,
ip_postal,
ref_category,
ref_clean,
ref_domain,
ref_netloc,
ref_path,
referrer,
session_initial_referrer,
session_initial_url,
sref_category,
sref_clean,
sref_domain,
sref_netloc,
surl_clean,
surl_domain,
surl_netloc,
ua_browser,
ua_device,
ua_devicetype,
url,
url_clean,
url_domain
from public.demo_mode_dpl DPL
join (
select
apikey,
session_id,
visitor_ip,
visitor_site_id,
metadata_post_id,
sum(engaged_time_inc) as engaged_time
from public.demo_mode_dpl
group by
apikey,
session_id,
visitor_ip,
visitor_site_id,
metadata_post_id
having sum(engaged_time_inc) >0
) E
on E.apikey = DPL.apikey
and E.session_id = DPL.session_id
and E.visitor_ip = DPL.visitor_ip
and E.visitor_site_id = DPL.visitor_site_id
and E.metadata_post_id = DPL.metadata_post_id
where action = 'pageview'
and metadata_page_type = 'post';
--SESSION_FACT:
--Grain: 1 row per apikey, session, user, page type
--Metrics: engaged_time, pageviews
--Calculated: entry_url, exit_url
select
--- ids
row_number() OVER (ORDER BY DPL.apikey,
DPL.session_id,
DPL.campaign_id,
DPL.visitor_ip,
DPL.visitor_site_id,
DPL.metadata_page_type) session_fact_id,
DPL.apikey,
DPL.session_id,
DPL.campaign_id,
DPL.visitor_ip,
DPL.visitor_site_id,
DPL.metadata_page_type,
--- metrics
sum(DPL.engaged_time_inc) as f,
PV.pageviews,
PV.visitors,
EE.entry_url,
EE.exit_url,
1 as session_counter
--- dimensions
from parsely_dpl DPL
join
-- min/max entry page
(select distinct
SP.apikey,
SP.session_id,
SP.visitor_ip,
SP.visitor_site_id,
SPMin.url_clean as entry_url,
SPMax.url_clean as exit_url,
SP.metadata_page_type
from
(select distinct
apikey,
session_id,
campaign_id,
visitor_ip,
visitor_site_id,
metadata_page_type,
url_clean,
timestamp_info_nginx_ms
from parsely_dpl) SPMin
join
(select
apikey,
session_id,
campaign_id,
visitor_ip,
visitor_site_id,
metadata_page_type,
min(timestamp_info_nginx_ms) as session_min_timestamp,
max(timestamp_info_nginx_ms) as session_max_timestamp
from parsely_dpl
group by apikey, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type) SP
on
(SPMin.timestamp_info_nginx_ms = SP.session_min_timestamp)
and SP.session_id = SPMin.session_id
and SP.visitor_site_id = SPMin.visitor_site_id
and SP.visitor_ip = SPMin.visitor_ip
and SP.metadata_page_type = SPMin.metadata_page_type
and SP.apikey = SPMin.apikey
join
(select distinct
apikey,
session_id,
campaign_id,
visitor_ip,
visitor_site_id,
metadata_page_type,
url_clean,
timestamp_info_nginx_ms
from parsely_dpl) SPMax
on
(SPMax.timestamp_info_nginx_ms = SP.session_max_timestamp)
and SP.session_id = SPMax.session_id
and SP.visitor_site_id = SPMax.visitor_site_id
and SP.visitor_ip = SPMax.visitor_ip
and SP.metadata_page_type = SPMax.metadata_page_type
and SP.apikey = SPMax.apikey
order by SP.visitor_site_id) EE
on EE.visitor_ip = DPL.visitor_ip
and EE.session_id = DPL.session_id
and EE.apikey = DPL.apikey
and EE.visitor_site_id = DPL.visitor_site_id
and EE.apikey = DPL.apikey
and EE.metadata_page_type = DPL.metadata_page_type
join
(select
apikey,
session_id,
campaign_id,
visitor_ip,
visitor_site_id,
metadata_page_type,
count(session_id) as pageviews,
count(distinct visitor_site_id) as visitors
from parsely_dpl
where action='pageview'
group by apikey, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type)PV
on PV.apikey = DPL.apikey
and PV.session_id = DPL.session_id
and PV.visitor_ip = DPL.visitor_ip
and PV.visitor_site_id = DPL.visitor_site_id
and PV.metadata_page_type = DPL.metadata_page_type
group by DPL.apikey, DPL.session_id, DPL.campaign_id, DPL.visitor_ip, DPL.visitor_site_id, DPL.metadata_page_type,PV.pageviews,PV.visitors,
EE.entry_url,
EE.exit_url;
--Redshift view create
create view session_fact as
select
--- ids
DPL.apikey,
DPL.session_id,
DPL.campaign_id,
DPL.visitor_ip,
DPL.visitor_site_id,
DPL.metadata_page_type,
--- metrics
sum(DPL.engaged_time_inc) as engaged_time,
PV.visitors,
EE.entry_url,
EE.exit_url,
1 as session_counter
--- dimensions
from public.demo_mode_dpl DPL
join
-- min/max entry page
(select distinct
SP.apikey,
SP.session_id,
SP.visitor_ip,
SP.visitor_site_id,
SPMin.url_clean as entry_url,
SPMax.url_clean as exit_url,
SP.metadata_page_type
from
(select distinct
apikey,
session_id,
campaign_id,
visitor_ip,
visitor_site_id,
metadata_page_type,
url_clean,
timestamp_info_nginx_ms
from public.demo_mode_dpl) SPMin
join
(select
apikey,
session_id,
campaign_id,
visitor_ip,
visitor_site_id,
metadata_page_type,
min(timestamp_info_nginx_ms) as session_min_timestamp,
max(timestamp_info_nginx_ms) as session_max_timestamp
from public.demo_mode_dpl
group by apikey, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type) SP
on
(SPMin.timestamp_info_nginx_ms = SP.session_min_timestamp)
and SP.session_id = SPMin.session_id
and SP.visitor_site_id = SPMin.visitor_site_id
and SP.visitor_ip = SPMin.visitor_ip
and SP.metadata_page_type = SPMin.metadata_page_type
and SP.apikey = SPMin.apikey
join
(select distinct
apikey,
session_id,
campaign_id,
visitor_ip,
visitor_site_id,
metadata_page_type,
url_clean,
timestamp_info_nginx_ms
from public.demo_mode_dpl) SPMax
on
(SPMax.timestamp_info_nginx_ms = SP.session_max_timestamp)
and SP.session_id = SPMax.session_id
and SP.visitor_site_id = SPMax.visitor_site_id
and SP.visitor_ip = SPMax.visitor_ip
and SP.metadata_page_type = SPMax.metadata_page_type
and SP.apikey = SPMax.apikey
order by SP.visitor_site_id) EE
on EE.visitor_ip = DPL.visitor_ip
and EE.session_id = DPL.session_id
and EE.apikey = DPL.apikey
and EE.visitor_site_id = DPL.visitor_site_id
and EE.apikey = DPL.apikey
and EE.metadata_page_type = DPL.metadata_page_type
join
(select
apikey,
session_id,
campaign_id,
visitor_ip,
visitor_site_id,
metadata_page_type,
count(session_id) as pageviews,
count(distinct visitor_site_id) as visitors
from public.demo_mode_dpl
where action='pageview'
group by apikey, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type)PV
on PV.apikey = DPL.apikey
and PV.session_id = DPL.session_id
and PV.visitor_ip = DPL.visitor_ip
and PV.visitor_site_id = DPL.visitor_site_id
and PV.metadata_page_type = DPL.metadata_page_type
group by DPL.apikey, DPL.session_id, DPL.campaign_id, DPL.visitor_ip, DPL.visitor_site_id, DPL.metadata_page_type,PV.pageviews,PV.visitors,
EE.entry_url,
EE.exit_url;
--USER_DIM:
--Grain: 1 row per user
--Calculated: engaged_user (currently defined as >10 sessions in full data set)
select distinct
--- ids
row_number() OVER (ORDER BY
visitor_ip,
B.visitor_site_id) as user_dim_id,
visitor_ip,
B.visitor_site_id,
case when
num_of_sessions > 10
then 1 else 0 end as engaged_user,
case
when num_of_sessions > 10 then 'Loyalty'
when num_of_sessions > 1 then 'Returning'
else 'One Time' end as user_type
--- dimensions
from
(select
visitor_site_id,
count(distinct session_id) as num_of_sessions
from parsely_dpl
where action = 'pageview'
group by visitor_site_id) A
join
(select distinct
visitor_site_id,
visitor_ip
from parsely_dpl) B
on A.visitor_site_id = B.visitor_site_id;
--redshift view create
create view user_dim as
select distinct
--- ids
visitor_ip,
B.visitor_site_id,
case when
num_of_sessions > 10
then 1 else 0 end as engaged_user,
case
when num_of_sessions > 10 then 'Loyalty'
when num_of_sessions > 1 then 'Returning'
else 'One Time' end as user_type
--- dimensions
from
(select
visitor_site_id,
count(distinct session_id) as num_of_sessions
from public.demo_mode_dpl
where action = 'pageview'
group by visitor_site_id) A
join
(select distinct
visitor_site_id,
visitor_ip
from public.demo_mode_dpl) B
on A.visitor_site_id = B.visitor_site_id;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment