Skip to content

Instantly share code, notes, and snippets.

@vitalibertas
Last active August 13, 2017 04:10
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save vitalibertas/3653fcf459647ca533dc81e8edf69dd5 to your computer and use it in GitHub Desktop.
Hive query that uses arrays, aggregates, and windowing to determine customer onboarding category.
WITH Landing AS (
SELECT
visit_id
,COLLECT_SET(shopper_id) AS shopper_array
,MIN(sequence) AS min_sequence
FROM
visits
WHERE
page_type = 'landing'
GROUP BY
visit_id
), Rank AS (
SELECT
vpr.visit_id
,vpr.visitorguid
,vpr.visitdate
,l.shopper_array
,vpr.pagerequest_id
,vpr.page_type
,vpr.page_subject
,vpr.isccode
,row_number() OVER (PARTITION BY vpr.visit_id ORDER BY page_rank DESC, sequence) AS row_number
FROM
Landing AS l
JOIN
visits AS vpr
ON l.visit_id = vpr.visit_id
AND vpr.page_type = 'onboarding'
WHERE
vpr.sequence > l.min_sequence
)
INSERT OVERWRITE TABLE signup_visits
SELECT
visit_id
,visitorguid
,visitdate
,shopper_array
,CONCAT(page_subject, '_onboarding') AS onboarding_type
,pagerequest_id
,isccode
FROM
Rank
WHERE
row_number = 1
;
SELECT
visitdate
,shopper_id
,visittracking_id
,visitorguid
,onboarding_type
FROM
signup_visits
LATERAL VIEW explode(shopper_array) shoppers AS shopper_id
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment