Created
July 21, 2017 01:58
-
-
Save rachelannelise/922f153d2c47ffb7d5c68b1e1c4cde08 to your computer and use it in GitHub Desktop.
DPL Star Schema Queries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--CONTENT_DIM: | |
--Grain: 1 row per apikey, post | |
select distinct | |
--- ids | |
row_number() OVER (ORDER BY | |
apikey, | |
metadata_post_id) as content_dim_id, | |
--Redshift query creations | |
create view content_dim as | |
select distinct | |
--- ids | |
apikey, | |
metadata_post_id, | |
--- derived dimensions | |
case | |
when metadata_full_content_word_count >= 4000 then '4,000 or Above' | |
when metadata_full_content_word_count >= 3000 then '3,000 - 3,999' | |
when metadata_full_content_word_count >= 2000 then '2,000 - 2,999' | |
when metadata_full_content_word_count >= 1000 then '1,000 - 1,999' | |
when metadata_full_content_word_count >= 500 then '500 - 999' | |
when metadata_full_content_word_count >= 100 then '100 - 499' | |
else '< 100' end as word_count_buckets, | |
--- dimensions | |
metadata_authors, | |
metadata_full_content_word_count, | |
metadata_page_type, | |
metadata_pub_date_tmsp, | |
metadata_canonical_url, | |
metadata_section, | |
metadata_tags, | |
metadata_title | |
from public.demo_mode_dpl | |
where metadata_post_id is not null; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--SESSION_CONTENT_FACT: | |
--Grain: 1 row per apikey, session, user, post, page type | |
--Metrics: engaged_time, pageviews | |
select | |
--- ids | |
row_number() OVER (ORDER BY DPL.apikey, | |
DPL.session_id, | |
DPL.campaign_id, | |
DPL.visitor_ip, | |
DPL.visitor_site_id, | |
DPL.metadata_post_id, | |
DPL.metadata_page_type) session_content_fact_id, | |
DPL.apikey, | |
DPL.session_id, | |
DPL.campaign_id, | |
DPL.visitor_ip, | |
DPL.visitor_site_id, | |
DPL.metadata_post_id, | |
DPL.metadata_page_type, | |
--- metrics | |
sum(DPL.engaged_time_inc) as engaged_time, | |
PV.pageviews | |
from parsely_dpl DPL | |
join | |
(select | |
apikey, | |
session_id, | |
campaign_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_page_type, | |
metadata_post_id, | |
count(session_id) as pageviews | |
from parsely_dpl | |
where action='pageview' | |
group by apikey, metadata_post_id, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type)PV | |
on PV.apikey = DPL.apikey | |
and PV.session_id = DPL.session_id | |
and PV.visitor_ip = DPL.visitor_ip | |
and PV.visitor_site_id = DPL.visitor_site_id | |
and PV.metadata_page_type = DPL.metadata_page_type | |
and PV.metadata_post_id = DPL.metadata_post_id | |
group by DPL.apikey, DPL.session_id, DPL.campaign_id, DPL.visitor_ip, DPL.visitor_site_id, DPL.metadata_post_id, DPL.metadata_page_type,PV.pageviews; | |
--redshift view creation | |
create view session_content_fact as | |
select | |
--- ids | |
DPL.apikey, | |
DPL.session_id, | |
DPL.campaign_id, | |
DPL.visitor_ip, | |
DPL.visitor_site_id, | |
DPL.metadata_post_id, | |
DPL.metadata_page_type, | |
--- metrics | |
sum(DPL.engaged_time_inc) as engaged_time, | |
PV.pageviews | |
from public.demo_mode_dpl DPL | |
join | |
(select | |
apikey, | |
session_id, | |
campaign_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_page_type, | |
metadata_post_id, | |
count(session_id) as pageviews | |
from public.demo_mode_dpl | |
where action='pageview' | |
group by apikey, metadata_post_id, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type)PV | |
on PV.apikey = DPL.apikey | |
and PV.session_id = DPL.session_id | |
and PV.visitor_ip = DPL.visitor_ip | |
and PV.visitor_site_id = DPL.visitor_site_id | |
and PV.metadata_page_type = DPL.metadata_page_type | |
and PV.metadata_post_id = DPL.metadata_post_id | |
group by DPL.apikey, DPL.session_id, DPL.campaign_id, DPL.visitor_ip, DPL.visitor_site_id, DPL.metadata_post_id, DPL.metadata_page_type,PV.pageviews; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--SESSION_DETAIL_DIM: | |
--Grain: 1 row per apikey, session, user, post (both pageviews and heartbeats) | |
select distinct | |
--- ids | |
row_number() OVER (ORDER BY | |
apikey, | |
session_id, | |
campaign_id, | |
event_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_post_id, | |
action) as session_detail_dim_id, | |
apikey, | |
session_id, | |
campaign_id, | |
event_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_post_id, | |
--- dimensions | |
action, | |
timestamp_info, | |
timestamp_info_nginx_ms, | |
timestamp_info_override_ms, | |
timestamp_info_pixel_ms, | |
ts_action, | |
ts_session_current, | |
ts_session_last | |
from parsely_dpl | |
where metadata_page_type = 'post' | |
order by visitor_ip, session_id asc; | |
--redshift view creation | |
create view session_detail_dim as | |
select distinct | |
--- ids | |
apikey, | |
session_id, | |
campaign_id, | |
event_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_post_id, | |
--- dimensions | |
action, | |
timestamp_info, | |
timestamp_info_nginx_ms, | |
timestamp_info_override_ms, | |
timestamp_info_pixel_ms, | |
ts_action, | |
ts_session_current, | |
ts_session_last | |
from public.demo_mode_dpl | |
where metadata_page_type = 'post'; | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--SESSION_DIM: | |
--Grain: 1 row per apikey, session, user (pageviews only) | |
--Redshift view create\ | |
create view session_dim as | |
select distinct | |
--- ids | |
DPL.apikey, | |
DPL.session_id, | |
campaign_id, | |
DPL.event_id, | |
DPL.visitor_ip, | |
DPL.visitor_site_id, | |
DPL.metadata_post_id, | |
--- derived dimensions | |
case | |
when engaged_time > 30 then 'Deep Read' | |
when engaged_time > 10 then 'Read' | |
else 'Skim' end as read_category, | |
datediff(hour, | |
(TIMESTAMP 'epoch' + left(metadata_pub_date_tmsp,10)::bigint * INTERVAL '1 Second '), | |
(TIMESTAMP 'epoch' + left(timestamp_info_nginx_ms,10)::bigint * INTERVAL '1 Second ')) as hours_since_published, | |
datediff(day, | |
(TIMESTAMP 'epoch' + left(metadata_pub_date_tmsp,10)::bigint * INTERVAL '1 Second '), | |
(TIMESTAMP 'epoch' + left(timestamp_info_nginx_ms,10)::bigint * INTERVAL '1 Second ')) as days_since_published, | |
datediff(week, | |
(TIMESTAMP 'epoch' + left(metadata_pub_date_tmsp,10)::bigint * INTERVAL '1 Second '), | |
(TIMESTAMP 'epoch' + left(timestamp_info_nginx_ms,10)::bigint * INTERVAL '1 Second ')) as weeks_since_published, | |
--- dimensions | |
action, | |
ip_city, | |
ip_continent, | |
ip_country, | |
ip_lat, | |
ip_lon, | |
ip_postal, | |
ref_category, | |
ref_clean, | |
ref_domain, | |
ref_netloc, | |
ref_path, | |
referrer, | |
session_initial_referrer, | |
session_initial_url, | |
sref_category, | |
sref_clean, | |
sref_domain, | |
sref_netloc, | |
surl_clean, | |
surl_domain, | |
surl_netloc, | |
ua_browser, | |
ua_device, | |
ua_devicetype, | |
url, | |
url_clean, | |
url_domain | |
from public.demo_mode_dpl DPL | |
join ( | |
select | |
apikey, | |
session_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_post_id, | |
sum(engaged_time_inc) as engaged_time | |
from public.demo_mode_dpl | |
group by | |
apikey, | |
session_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_post_id | |
having sum(engaged_time_inc) >0 | |
) E | |
on E.apikey = DPL.apikey | |
and E.session_id = DPL.session_id | |
and E.visitor_ip = DPL.visitor_ip | |
and E.visitor_site_id = DPL.visitor_site_id | |
and E.metadata_post_id = DPL.metadata_post_id | |
where action = 'pageview' | |
and metadata_page_type = 'post'; | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--SESSION_FACT: | |
--Grain: 1 row per apikey, session, user, page type | |
--Metrics: engaged_time, pageviews | |
--Calculated: entry_url, exit_url | |
select | |
--- ids | |
row_number() OVER (ORDER BY DPL.apikey, | |
DPL.session_id, | |
DPL.campaign_id, | |
DPL.visitor_ip, | |
DPL.visitor_site_id, | |
DPL.metadata_page_type) session_fact_id, | |
DPL.apikey, | |
DPL.session_id, | |
DPL.campaign_id, | |
DPL.visitor_ip, | |
DPL.visitor_site_id, | |
DPL.metadata_page_type, | |
--- metrics | |
sum(DPL.engaged_time_inc) as f, | |
PV.pageviews, | |
PV.visitors, | |
EE.entry_url, | |
EE.exit_url, | |
1 as session_counter | |
--- dimensions | |
from parsely_dpl DPL | |
join | |
-- min/max entry page | |
(select distinct | |
SP.apikey, | |
SP.session_id, | |
SP.visitor_ip, | |
SP.visitor_site_id, | |
SPMin.url_clean as entry_url, | |
SPMax.url_clean as exit_url, | |
SP.metadata_page_type | |
from | |
(select distinct | |
apikey, | |
session_id, | |
campaign_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_page_type, | |
url_clean, | |
timestamp_info_nginx_ms | |
from parsely_dpl) SPMin | |
join | |
(select | |
apikey, | |
session_id, | |
campaign_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_page_type, | |
min(timestamp_info_nginx_ms) as session_min_timestamp, | |
max(timestamp_info_nginx_ms) as session_max_timestamp | |
from parsely_dpl | |
group by apikey, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type) SP | |
on | |
(SPMin.timestamp_info_nginx_ms = SP.session_min_timestamp) | |
and SP.session_id = SPMin.session_id | |
and SP.visitor_site_id = SPMin.visitor_site_id | |
and SP.visitor_ip = SPMin.visitor_ip | |
and SP.metadata_page_type = SPMin.metadata_page_type | |
and SP.apikey = SPMin.apikey | |
join | |
(select distinct | |
apikey, | |
session_id, | |
campaign_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_page_type, | |
url_clean, | |
timestamp_info_nginx_ms | |
from parsely_dpl) SPMax | |
on | |
(SPMax.timestamp_info_nginx_ms = SP.session_max_timestamp) | |
and SP.session_id = SPMax.session_id | |
and SP.visitor_site_id = SPMax.visitor_site_id | |
and SP.visitor_ip = SPMax.visitor_ip | |
and SP.metadata_page_type = SPMax.metadata_page_type | |
and SP.apikey = SPMax.apikey | |
order by SP.visitor_site_id) EE | |
on EE.visitor_ip = DPL.visitor_ip | |
and EE.session_id = DPL.session_id | |
and EE.apikey = DPL.apikey | |
and EE.visitor_site_id = DPL.visitor_site_id | |
and EE.apikey = DPL.apikey | |
and EE.metadata_page_type = DPL.metadata_page_type | |
join | |
(select | |
apikey, | |
session_id, | |
campaign_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_page_type, | |
count(session_id) as pageviews, | |
count(distinct visitor_site_id) as visitors | |
from parsely_dpl | |
where action='pageview' | |
group by apikey, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type)PV | |
on PV.apikey = DPL.apikey | |
and PV.session_id = DPL.session_id | |
and PV.visitor_ip = DPL.visitor_ip | |
and PV.visitor_site_id = DPL.visitor_site_id | |
and PV.metadata_page_type = DPL.metadata_page_type | |
group by DPL.apikey, DPL.session_id, DPL.campaign_id, DPL.visitor_ip, DPL.visitor_site_id, DPL.metadata_page_type,PV.pageviews,PV.visitors, | |
EE.entry_url, | |
EE.exit_url; | |
--Redshift view create | |
create view session_fact as | |
select | |
--- ids | |
DPL.apikey, | |
DPL.session_id, | |
DPL.campaign_id, | |
DPL.visitor_ip, | |
DPL.visitor_site_id, | |
DPL.metadata_page_type, | |
--- metrics | |
sum(DPL.engaged_time_inc) as engaged_time, | |
PV.visitors, | |
EE.entry_url, | |
EE.exit_url, | |
1 as session_counter | |
--- dimensions | |
from public.demo_mode_dpl DPL | |
join | |
-- min/max entry page | |
(select distinct | |
SP.apikey, | |
SP.session_id, | |
SP.visitor_ip, | |
SP.visitor_site_id, | |
SPMin.url_clean as entry_url, | |
SPMax.url_clean as exit_url, | |
SP.metadata_page_type | |
from | |
(select distinct | |
apikey, | |
session_id, | |
campaign_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_page_type, | |
url_clean, | |
timestamp_info_nginx_ms | |
from public.demo_mode_dpl) SPMin | |
join | |
(select | |
apikey, | |
session_id, | |
campaign_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_page_type, | |
min(timestamp_info_nginx_ms) as session_min_timestamp, | |
max(timestamp_info_nginx_ms) as session_max_timestamp | |
from public.demo_mode_dpl | |
group by apikey, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type) SP | |
on | |
(SPMin.timestamp_info_nginx_ms = SP.session_min_timestamp) | |
and SP.session_id = SPMin.session_id | |
and SP.visitor_site_id = SPMin.visitor_site_id | |
and SP.visitor_ip = SPMin.visitor_ip | |
and SP.metadata_page_type = SPMin.metadata_page_type | |
and SP.apikey = SPMin.apikey | |
join | |
(select distinct | |
apikey, | |
session_id, | |
campaign_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_page_type, | |
url_clean, | |
timestamp_info_nginx_ms | |
from public.demo_mode_dpl) SPMax | |
on | |
(SPMax.timestamp_info_nginx_ms = SP.session_max_timestamp) | |
and SP.session_id = SPMax.session_id | |
and SP.visitor_site_id = SPMax.visitor_site_id | |
and SP.visitor_ip = SPMax.visitor_ip | |
and SP.metadata_page_type = SPMax.metadata_page_type | |
and SP.apikey = SPMax.apikey | |
order by SP.visitor_site_id) EE | |
on EE.visitor_ip = DPL.visitor_ip | |
and EE.session_id = DPL.session_id | |
and EE.apikey = DPL.apikey | |
and EE.visitor_site_id = DPL.visitor_site_id | |
and EE.apikey = DPL.apikey | |
and EE.metadata_page_type = DPL.metadata_page_type | |
join | |
(select | |
apikey, | |
session_id, | |
campaign_id, | |
visitor_ip, | |
visitor_site_id, | |
metadata_page_type, | |
count(session_id) as pageviews, | |
count(distinct visitor_site_id) as visitors | |
from public.demo_mode_dpl | |
where action='pageview' | |
group by apikey, session_id, campaign_id, visitor_ip, visitor_site_id, metadata_page_type)PV | |
on PV.apikey = DPL.apikey | |
and PV.session_id = DPL.session_id | |
and PV.visitor_ip = DPL.visitor_ip | |
and PV.visitor_site_id = DPL.visitor_site_id | |
and PV.metadata_page_type = DPL.metadata_page_type | |
group by DPL.apikey, DPL.session_id, DPL.campaign_id, DPL.visitor_ip, DPL.visitor_site_id, DPL.metadata_page_type,PV.pageviews,PV.visitors, | |
EE.entry_url, | |
EE.exit_url; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--USER_DIM: | |
--Grain: 1 row per user | |
--Calculated: engaged_user (currently defined as >10 sessions in full data set) | |
select distinct | |
--- ids | |
row_number() OVER (ORDER BY | |
visitor_ip, | |
B.visitor_site_id) as user_dim_id, | |
visitor_ip, | |
B.visitor_site_id, | |
case when | |
num_of_sessions > 10 | |
then 1 else 0 end as engaged_user, | |
case | |
when num_of_sessions > 10 then 'Loyalty' | |
when num_of_sessions > 1 then 'Returning' | |
else 'One Time' end as user_type | |
--- dimensions | |
from | |
(select | |
visitor_site_id, | |
count(distinct session_id) as num_of_sessions | |
from parsely_dpl | |
where action = 'pageview' | |
group by visitor_site_id) A | |
join | |
(select distinct | |
visitor_site_id, | |
visitor_ip | |
from parsely_dpl) B | |
on A.visitor_site_id = B.visitor_site_id; | |
--redshift view create | |
create view user_dim as | |
select distinct | |
--- ids | |
visitor_ip, | |
B.visitor_site_id, | |
case when | |
num_of_sessions > 10 | |
then 1 else 0 end as engaged_user, | |
case | |
when num_of_sessions > 10 then 'Loyalty' | |
when num_of_sessions > 1 then 'Returning' | |
else 'One Time' end as user_type | |
--- dimensions | |
from | |
(select | |
visitor_site_id, | |
count(distinct session_id) as num_of_sessions | |
from public.demo_mode_dpl | |
where action = 'pageview' | |
group by visitor_site_id) A | |
join | |
(select distinct | |
visitor_site_id, | |
visitor_ip | |
from public.demo_mode_dpl) B | |
on A.visitor_site_id = B.visitor_site_id; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment