Last active
December 23, 2015 08:59
-
-
Save atharrison/6611314 to your computer and use it in GitHub Desktop.
Using Apache Hive to generate Histogram data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- TODO Cleanup, add documentation | |
select difference, count(distinct difference) from ( | |
select size(split(subject, '[ ]')) - size(split(subject, '[ ]')) as difference from readrate where day=YYYYMMDD | |
) t1 group by difference order by difference limit 10; | |
select difference, count(difference) as total from ( | |
select size(split(decoded_subject, '[ ]')) - size(split(subject, '[ ]')) as difference from readrate where day=YYYYMMDD limit 10) t1 | |
group by difference; | |
--select difference, count(difference) as total from ( select size(split(decoded_subject, '[ ]')) - size(split(subject, '[ ]')) as difference, subject, decoded_subject from readrate where day=20130917) | |
-- t1 | |
-- group by subject, decoded_subject, difference; | |
select distinct t2.difference, count(t2.difference), sum(t2.total) from ( | |
select difference, count(difference) as total from ( select size(split(decoded_subject, '[ ]')) - size(split(subject, '[ ]')) as difference, subject, decoded_subject from readrate where day=YYYYMMDD) | |
t1 | |
group by subject, decoded_subject, difference) t2 group by t2.difference; | |
CREATE EXTERNAL TABLE diff_count ( | |
difference INT, | |
count INT | |
) | |
PARTITIONED BY (day int) | |
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' | |
LINES TERMINATED BY '\n'; | |
ALTER TABLE diff_count ADD PARTITION (day=YYYYMMDD) LOCATION 's3://bucket/date/data/'; | |
select distinct difference, sum(count) from diff_count; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment