Skip to content

Instantly share code, notes, and snippets.

@atharrison
Last active December 23, 2015 08:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save atharrison/6611314 to your computer and use it in GitHub Desktop.
Save atharrison/6611314 to your computer and use it in GitHub Desktop.
Using Apache Hive to generate Histogram data
-- TODO Cleanup, add documentation
select difference, count(distinct difference) from (
select size(split(subject, '[ ]')) - size(split(subject, '[ ]')) as difference from readrate where day=YYYYMMDD
) t1 group by difference order by difference limit 10;
select difference, count(difference) as total from (
select size(split(decoded_subject, '[ ]')) - size(split(subject, '[ ]')) as difference from readrate where day=YYYYMMDD limit 10) t1
group by difference;
--select difference, count(difference) as total from ( select size(split(decoded_subject, '[ ]')) - size(split(subject, '[ ]')) as difference, subject, decoded_subject from readrate where day=20130917)
-- t1
-- group by subject, decoded_subject, difference;
select distinct t2.difference, count(t2.difference), sum(t2.total) from (
select difference, count(difference) as total from ( select size(split(decoded_subject, '[ ]')) - size(split(subject, '[ ]')) as difference, subject, decoded_subject from readrate where day=YYYYMMDD)
t1
group by subject, decoded_subject, difference) t2 group by t2.difference;
CREATE EXTERNAL TABLE diff_count (
difference INT,
count INT
)
PARTITIONED BY (day int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n';
ALTER TABLE diff_count ADD PARTITION (day=YYYYMMDD) LOCATION 's3://bucket/date/data/';
select distinct difference, sum(count) from diff_count;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment