Skip to content

Instantly share code, notes, and snippets.

@johnnyaug
Last active November 8, 2020 14:40
Show Gist options
  • Save johnnyaug/f0cd7805b54228d237af0f33d0f37942 to your computer and use it in GitHub Desktop.
Save johnnyaug/f0cd7805b54228d237af0f33d0f37942 to your computer and use it in GitHub Desktop.
-- Athena queries to share inventory stats with the Treeverse team.
-- The result of these queries will help us a great deal in engineering lakeFS, without exposing any data or object names.
-- Thank you for your effort and help!
-- It is assumed that you have a table named inventory_tbl representing the inventory.
-- A CREATE statement for such a table is available here: https://docs.aws.amazon.com/AmazonS3/latest/dev/storage-inventory.html#storage-inventory-athena-query
-- It is also assumed that Object-Level versioning is enabled on the bucket, and on the inventory (i.e. the inventory contains is_latest as a field).
-- If you don't have Object-Level versioning, please contact us and we will provide an alternative set of queries.
-- Replace the dates with the dates with the ones you intend to share with us (NOTE: they appear in three queries).
-- Replace inventory_tbl with the name of the inventory table, if it is different.
-- Run these queries on Athena, and share with us the result of the final SELECT query.
CREATE table tmp_lakefs_sample AS (
SELECT dt, key, row_number() over(PARTITION BY dt ORDER BY key) AS block_num
FROM inventory_tbl TABLESAMPLE BERNOULLI(0.02)
WHERE dt between '2020-10-20-00-00' and '2020-10-26-00-00' AND is_latest
);
CREATE table tmp_lakefs_blocks WITH (partitioned_by = ARRAY['dt']) AS (
SELECT t1.key AS min_key, t2.key as max_key, t1.block_num, t1.dt
FROM tmp_lakefs_sample t1 JOIN tmp_lakefs_sample t2 ON t1.dt = t2.dt AND t1.block_num + 1 = t2.block_num
GROUP BY t1.dt, t1.key, t2.key, t1.block_num
ORDER BY t1.block_num, 2, 3
);
CREATE table tmp_lakefs_inventory_times AS (
SELECT dt, MAX(last_modified_date) AS inventory_time
FROM inventory_tbl s
WHERE dt between '2020-10-20-00-00' and '2020-10-26-00-00'
GROUP BY 1
);
CREATE table tmp_lakefs_recently_modified
WITH (partitioned_by = ARRAY['dt'])
AS (
SELECT key,s.dt
FROM inventory_tbl s JOIN tmp_lakefs_inventory_times it
ON it.dt = s.dt
WHERE s.last_modified_date > it.inventory_time - interval '1' day AND is_latest
);
-- Send us the results of the following query:
SELECT dt,block_num,EXISTS(
SELECT key FROM tmp_lakefs_recently_modified s WHERE b.dt = s.dt AND s.key >= b.min_key AND s.key < b.max_key
) AS recently_modified
FROM tmp_lakefs_blocks b;
-- Please also send the results of the following query:
SELECT *, ROW_NUMBER() OVER(PARTITION BY dt ORDER BY changes DESC) AS rnk
FROM (
SELECT s.dt, last_modified_date, COUNT(*) AS changes
FROM inventory_tbl s JOIN tmp_lakefs_inventory_times it ON it.dt = s.dt
WHERE s.last_modified_date > it.inventory_time - interval '1' day
AND s.dt BETWEEN '2020-10-20-00-00' AND '2020-10-26-00-00'
GROUP BY 1,2
);
-- Drop all tables
DROP TABLE tmp_lakefs_sample;
DROP TABLE tmp_lakefs_blocks;
DROP TABLE tmp_lakefs_recently_modified;
DROP TABLE tmp_lakefs_inventory_times;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment