Skip to content

Instantly share code, notes, and snippets.

@vitalibertas
Created September 28, 2017 19:27
Show Gist options
  • Save vitalibertas/4eff16e088aca0122d8c167c7977c4c4 to your computer and use it in GitHub Desktop.
Save vitalibertas/4eff16e088aca0122d8c167c7977c4c4 to your computer and use it in GitHub Desktop.
Hive row_number() in place of aggregate to determine the maximum event time for each ID per day.
SET hive.execution.engine = mr;
SET hive.support.concurrency = false;
SET hive.exec.parallel = true;
SET hive.exec.dynamic.partition.mode=nonstrict;
USE hosting_stats;
WITH Rank AS (
SELECT
cid
,event_time
,storage_bytes_consumed
,storage_device
,storage_type
,year
,month
,day
,row_number() OVER (PARTITION BY cid ORDER BY event_time DESC) AS row_num
FROM
hosting_storage
WHERE
year = ${year}
AND month = ${month}
AND day = ${day}
)
INSERT OVERWRITE TABLE mwp_storage PARTITION (year, month, day)
SELECT
cid
,event_time
,storage_bytes_consumed
,storage_device
,storage_type
,year
,month
,day
FROM
Rank
WHERE
row_num = 1
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment