Skip to content

Instantly share code, notes, and snippets.

@rudibroekhuizen
Forked from joelonsql/hyperloglog-demo.sql
Created December 14, 2022 09:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rudibroekhuizen/11defea4337dee8933e3ec0b91378f6f to your computer and use it in GitHub Desktop.
Save rudibroekhuizen/11defea4337dee8933e3ec0b91378f6f to your computer and use it in GitHub Desktop.
HyperLogLog demo
--
-- Here comes some general advise on how to use HyperLogLog
-- for someone who wants to implement a YouTube-like service
-- using the awesome https://github.com/citusdata/postgresql-hll PostgreSQL extension
--
CREATE TABLE counter (
id text NOT NULL,
sketch hll NOT NULL DEFAULT hll_empty(),
PRIMARY KEY (id)
);
-- register new thing to count
INSERT INTO counter (id) VALUES ('my_cool_cat_video');
--
-- get the HLL-sketch datastructure
-- and also estimate the cardinality (=number of unique views) for it
--
-- read-only operation, thus horizontally scalable
--
SELECT
sketch,
hll_cardinality(sketch)
FROM counter WHERE id = 'my_cool_cat_video';
--
-- each time the video is watched,
-- the client should mutate the sketch-data structure
--
SELECT hll_add([current sketch value], hll_hash_bigint([the userid that watched the video]));
--
-- if the new value returned by hll_add() is different than the [current sketch value],
-- which will happen more and more infrequent over time, thus it scales well,
-- this will cause a write operation to the database:
--
UPDATE counter SET sketch = [new sketch value] WHERE id = 'my_cool_cat_video';
--
-- the hll_add(...,hll_hash_bigint()) is read-only can be computed by any PostgreSQL instance,
-- running anywhere, and doesn't need to run on the same instance that keeps the actual
-- value to update, thus is scales horizontally.
--
--
-- Example:
--
INSERT INTO counter (id) VALUES ('my_cool_cat_video');
SELECT
sketch,
hll_cardinality(sketch)
FROM counter WHERE id = 'my_cool_cat_video';
/*
sketch | hll_cardinality
----------+-----------------
\x118b7f | 0
(1 row)
*/
SELECT hll_add('\x118b7f'::hll, hll_hash_bigint(12345));
/*
hll_add
--------------------------
\x128b7f3cd2dbd3ee832997
(1 row)
*/
SELECT hll_cardinality('\x128b7f3cd2dbd3ee832997'::hll);
/*
hll_cardinality
-----------------
1
(1 row)
*/
--
-- same user watches same video again, value is unchanged:
--
SELECT hll_add('\x128b7f3cd2dbd3ee832997'::hll, hll_hash_bigint(12345));
/*
hll_add
--------------------------
\x128b7f3cd2dbd3ee832997
(1 row)
*/
--
-- some other user watches the same video, value this time changes:
--
SELECT hll_add('\x128b7f3cd2dbd3ee832997'::hll, hll_hash_bigint(54321));
/*
hll_add
------------------------------------------
\x128b7ffdda8f47ee4c2b8f3cd2dbd3ee832997
(1 row)
*/
SELECT hll_cardinality('\x128b7ffdda8f47ee4c2b8f3cd2dbd3ee832997'::hll);
/*
hll_cardinality
-----------------
2
(1 row)
*/
--
-- after thousands of users have watched the video,
-- the counter will not be 100% accurate and will just be an estimation,
-- but a good estimation with predictable error:
-- "The HyperLogLog algorithm is able to estimate cardinalities of > 109 with a typical accuracy (standard error) of 2%, using 1.5 kB of memory."
-- https://en.wikipedia.org/wiki/HyperLogLog
--
--
-- Simulate 10000 users watching the video:
--
DO $$
DECLARE
_sketch hll;
BEGIN
_sketch := hll_empty();
FOR i IN 1..10000 LOOP
_sketch := hll_add(_sketch, hll_hash_bigint(i));
END LOOP;
RAISE NOTICE 'count estimation: %, sketch value: %', hll_cardinality(_sketch), _sketch;
END
$$;
/*
NOTICE: count estimation: 9923.837380245724, sketch value: \x148b7f10c46114621884540c43210681106210c82220a31884539c433088618c6618c4310462214a2210621048310063288822144228c6409044188c120421310a421022194e320cc3110a2088c520c4118d04208a410c23110c31904420862114661846348822188853944320842188832902210c661144220c2510c8329cc119426184683086420825398e208ce311081008e4110e3094a4188832112721882218221108220c652904321083304a51048230c4910cc2120c339ca20104219421194a4190a329127188a413043198a820c8318c25110a3210431908121042194c2091021244208c861906410c812086318c4638086288e221465388851142219c6320c6200ca42008319ca218866418a20886328445210821904808c641086218c8218d023104318082110631206628c8218c6310465408c72104319c63188881904331045190c46106218cc7304812108420c6338c6510c4220c85094251988339043110883144418c6619467210a218442108642088610822208611188318ca331823194c61906708c82104671886220863095842986410c6311442208a641442208432944311863214c209423190a81148228c6418ca73946420c83304e8594c4210642804338883084854104431064290603884318824388243144320ca320844184422142440c4618c6120c23211053086228443208822104610c8518c84288633186229501210441148228842214631946419ce21248311062214c21246231445284e41988508962110c319c432888418c65204453108518ca210c861182721085114a320c8318c8810864110621106228881214e5304a3388c720c81194a211863191471148419cc419cc1190431188210c4118c62108831842732082090412946211c2220c23388c409028188a329468198e011c81290a21188320ce320c42210620142818469111041906438c83314a2214e211c222844520862490a32088120c4510c82110861906229c821884221902295041944310c83194631882811883110c2188a619483108632104210c8121862288441948529081188631886718886198232886220844188821884510c4321065508a510063294641148318843190831b88800c6308c6328c673886311066210631908521c8329465109011086328c421882228ce220c44206012106310863390c62048538c6311084208632842220c4419063190c1190470946510c4418042110840886730c872186228863518433848218c42290e120c62084e51908430cc5308422084268c45210831108228883208e43104320d0310c42120831186210c84198632a4431082130ca6194c328466218c320c0710c83208a12102220c4418c231946221464110a3194a61046728c83110a519464108633886310c6221cc21886321464204631884520d01194620a0a6304c411047204a22908720c22190e4294a2284c4298c538ca730cc51804119885194601186318c630946420c8239862088222186248c413906608c4810c6511483128620a4c619823214641286441c8428461288a32088131c4518c6628c6138cc30882318c432884628c8328c43208c308cc2390431086319ca10a484190a418c242846410c432108618462188a21944528c671206420866288642990420c4318c43388651844110845108843082628864084630906611043190a4090a30844218c263144418486198a3194a228c4308cc2090c310d84288620a0a21048411464108c4108c308c421146511084108431046511044118412146410824
*/
--
-- if we now simulate letting userid 10001 watch the video,
-- it is likely the value will not change, since it only changes
-- for some values, which statistically over time makes a good estimation.
--
-- let's see if the HLL value is unchanged, when we simulate userid 10001 watching the video:
--
SELECT
'\x148b7f10c46114621884540c43210681106210c82220a31884539c433088618c6618c4310462214a2210621048310063288822144228c6409044188c120421310a421022194e320cc3110a2088c520c4118d04208a410c23110c31904420862114661846348822188853944320842188832902210c661144220c2510c8329cc119426184683086420825398e208ce311081008e4110e3094a4188832112721882218221108220c652904321083304a51048230c4910cc2120c339ca20104219421194a4190a329127188a413043198a820c8318c25110a3210431908121042194c2091021244208c861906410c812086318c4638086288e221465388851142219c6320c6200ca42008319ca218866418a20886328445210821904808c641086218c8218d023104318082110631206628c8218c6310465408c72104319c63188881904331045190c46106218cc7304812108420c6338c6510c4220c85094251988339043110883144418c6619467210a218442108642088610822208611188318ca331823194c61906708c82104671886220863095842986410c6311442208a641442208432944311863214c209423190a81148228c6418ca73946420c83304e8594c4210642804338883084854104431064290603884318824388243144320ca320844184422142440c4618c6120c23211053086228443208822104610c8518c84288633186229501210441148228842214631946419ce21248311062214c21246231445284e41988508962110c319c432888418c65204453108518ca210c861182721085114a320c8318c8810864110621106228881214e5304a3388c720c81194a211863191471148419cc419cc1190431188210c4118c62108831842732082090412946211c2220c23388c409028188a329468198e011c81290a21188320ce320c42210620142818469111041906438c83314a2214e211c222844520862490a32088120c4510c82110861906229c821884221902295041944310c83194631882811883110c2188a619483108632104210c8121862288441948529081188631886718886198232886220844188821884510c4321065508a510063294641148318843190831b88800c6308c6328c673886311066210631908521c8329465109011086328c421882228ce220c44206012106310863390c62048538c6311084208632842220c4419063190c1190470946510c4418042110840886730c872186228863518433848218c42290e120c62084e51908430cc5308422084268c45210831108228883208e43104320d0310c42120831186210c84198632a4431082130ca6194c328466218c320c0710c83208a12102220c4418c231946221464110a3194a61046728c83110a519464108633886310c6221cc21886321464204631884520d01194620a0a6304c411047204a22908720c22190e4294a2284c4298c538ca730cc51804119885194601186318c630946420c8239862088222186248c413906608c4810c6511483128620a4c619823214641286441c8428461288a32088131c4518c6628c6138cc30882318c432884628c8328c43208c308cc2390431086319ca10a484190a418c242846410c432108618462188a21944528c671206420866288642990420c4318c43388651844110845108843082628864084630906611043190a4090a30844218c263144418486198a3194a228c4308cc2090c310d84288620a0a21048411464108c4108c308c421146511084108431046511044118412146410824'::hll
=
hll_add
(
'\x148b7f10c46114621884540c43210681106210c82220a31884539c433088618c6618c4310462214a2210621048310063288822144228c6409044188c120421310a421022194e320cc3110a2088c520c4118d04208a410c23110c31904420862114661846348822188853944320842188832902210c661144220c2510c8329cc119426184683086420825398e208ce311081008e4110e3094a4188832112721882218221108220c652904321083304a51048230c4910cc2120c339ca20104219421194a4190a329127188a413043198a820c8318c25110a3210431908121042194c2091021244208c861906410c812086318c4638086288e221465388851142219c6320c6200ca42008319ca218866418a20886328445210821904808c641086218c8218d023104318082110631206628c8218c6310465408c72104319c63188881904331045190c46106218cc7304812108420c6338c6510c4220c85094251988339043110883144418c6619467210a218442108642088610822208611188318ca331823194c61906708c82104671886220863095842986410c6311442208a641442208432944311863214c209423190a81148228c6418ca73946420c83304e8594c4210642804338883084854104431064290603884318824388243144320ca320844184422142440c4618c6120c23211053086228443208822104610c8518c84288633186229501210441148228842214631946419ce21248311062214c21246231445284e41988508962110c319c432888418c65204453108518ca210c861182721085114a320c8318c8810864110621106228881214e5304a3388c720c81194a211863191471148419cc419cc1190431188210c4118c62108831842732082090412946211c2220c23388c409028188a329468198e011c81290a21188320ce320c42210620142818469111041906438c83314a2214e211c222844520862490a32088120c4510c82110861906229c821884221902295041944310c83194631882811883110c2188a619483108632104210c8121862288441948529081188631886718886198232886220844188821884510c4321065508a510063294641148318843190831b88800c6308c6328c673886311066210631908521c8329465109011086328c421882228ce220c44206012106310863390c62048538c6311084208632842220c4419063190c1190470946510c4418042110840886730c872186228863518433848218c42290e120c62084e51908430cc5308422084268c45210831108228883208e43104320d0310c42120831186210c84198632a4431082130ca6194c328466218c320c0710c83208a12102220c4418c231946221464110a3194a61046728c83110a519464108633886310c6221cc21886321464204631884520d01194620a0a6304c411047204a22908720c22190e4294a2284c4298c538ca730cc51804119885194601186318c630946420c8239862088222186248c413906608c4810c6511483128620a4c619823214641286441c8428461288a32088131c4518c6628c6138cc30882318c432884628c8328c43208c308cc2390431086319ca10a484190a418c242846410c432108618462188a21944528c671206420866288642990420c4318c43388651844110845108843082628864084630906611043190a4090a30844218c263144418486198a3194a228c4308cc2090c310d84288620a0a21048411464108c4108c308c421146511084108431046511044118412146410824'::hll,
hll_hash_bigint(10001)
) AS test_if_unchanged
;
/*
test_if_unchanged
-------------------
t
(1 row)
*/
--
-- Yes! We were lucky, the hash of 10001 didn't cause any update of the value,
-- we thus didn't need to update the value in the database, and saved a write-operation this time.
--
--
-- The HyperLogLog extension allows controlling how often such updates occur,
-- and how good estimations is desired, using parameters to the hll_empty() function,
-- see the documentation for details. By default, it keeps exact count for small values,
-- which is probably a good, since it's more notable if there are differences for small values,
-- since a user might know some friend watched their video, but the counter is not updated.
-- But for bigger values, it switches over to estimation, which is probably OK since users
-- won't notice if the value is off by a few values.
--
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment