Skip to content

Instantly share code, notes, and snippets.

@phstudy
Last active May 12, 2021 14:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save phstudy/545b43bc3b944ecd06685238913f3c5e to your computer and use it in GitHub Desktop.
Save phstudy/545b43bc3b944ecd06685238913f3c5e to your computer and use it in GitHub Desktop.
hive with data-sketches
# https://datasketches.github.io/docs/Theta/ThetaHiveUDFs.html
# sudo -u hive hadoop fs -mkdir /user/hive/auxjars
# sudo -u hive hadoop fs -copyFromLocal datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar /user/hive/auxjars/
CREATE FUNCTION data2sketch as 'com.yahoo.sketches.hive.theta.DataToSketchUDAF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar';
CREATE FUNCTION unionSketches as 'com.yahoo.sketches.hive.theta.UnionSketchUDAF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar';
CREATE FUNCTION estimate as 'com.yahoo.sketches.hive.theta.EstimateSketchUDF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar';
CREATE FUNCTION union2 as 'com.yahoo.sketches.hive.theta.UnionSketchUDF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar';
CREATE FUNCTION intersect2 as 'com.yahoo.sketches.hive.theta.IntersectSketchUDF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar';
CREATE FUNCTION anotb as 'com.yahoo.sketches.hive.theta.ExcludeSketchUDF' USING JAR 'hdfs:///user/hive/auxjars/datasketches-hive-0.13.1-SNAPSHOT-with-shaded-core.jar';
# Building sketches, merging sketches and getting estimates
create temporary table theta_input (id int, category char(1));
insert into table theta_input values
(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'),
(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b');
create temporary table sketch_intermediate (category char(1), sketch binary);
insert into sketch_intermediate select category, data2sketch(id) from theta_input group by category;
select category, estimate(sketch) from sketch_intermediate;
select estimate(unionSketches(sketch)) from sketch_intermediate;
# Set operations
create temporary table sketch_input (id1 int, id2 int);
insert into table sketch_input values
(1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), (10, 20);
create temporary table sketch_intermediate2 (sketch1 binary, sketch2 binary);
insert into sketch_intermediate2 select data2sketch(id1), data2sketch(id2) from sketch_input;
select
estimate(sketch1),
estimate(sketch2),
estimate(union2(sketch1, sketch2)),
estimate((intersect2(sketch1, sketch2))),
estimate(anotb(sketch1, sketch2)),
estimate(anotb(sketch2, sketch1))
from sketch_intermediate2;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment