Skip to content

Instantly share code, notes, and snippets.

@sayle-doit
Last active October 31, 2023 13:53
Show Gist options
  • Save sayle-doit/d1d14b951b2b4173e9014cbc93d370c6 to your computer and use it in GitHub Desktop.
Save sayle-doit/d1d14b951b2b4173e9014cbc93d370c6 to your computer and use it in GitHub Desktop.
Query to count out the number of query jobs that reference each table in a project.
--
-- This query will pull out a count of all tables referenced over the past
-- 7 days (configurable) by query jobs in the current project.
-- Note this will pull in queries that query the information schema views.
--
-- Note by default this hits the US multi-region, so if using a different
-- region then change region-us below to that region name.
--
DECLARE start_time TIMESTAMP DEFAULT
-- Change this start time here if wanting to change from 7 days to present
TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY);
DECLARE end_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP();
WITH tables AS (
SELECT
jobs.job_id,
tables.*
FROM
`region-us`.INFORMATION_SCHEMA.JOBS AS jobs,
UNNEST(referenced_tables) AS tables
WHERE
-- Only include query jobs and exclude script query jobs.
-- The reason is that script job types have sub-jobs that
-- will be counted as well and lead to skewed data.
job_type = 'QUERY'
AND statement_type <> 'SCRIPT'
-- This line filters out CDC queries that have a type of NULL
-- and job_type of BACKGROUND
AND statement_TYPE IS NOT NULL
),
-- Grab the information schema jobs using some basic regex
info_schema_jobs AS (
SELECT
-- Note doing a distinct here as sometimes this query will pull back
-- duplicate values which skew the final results.
DISTINCT tables.*
-- Doing a lowercase on all of the region dataset names.
-- Sometimes they come out with uppercases in them that hurts
-- being able to filter them later. An example is region-us and
-- region-US are both returned at times.
REPLACE (LOWER(dataset_id) AS dataset_id)
FROM
tables
WHERE
REGEXP_CONTAINS(dataset_id, '^(region-[a-zA-Z]+)$') = TRUE
OR REGEXP_CONTAINS(dataset_id, '^([a-z]+-[a-z]+[0-9]+)$') = TRUE
),
-- Remove all jobs from temporary datasets or the information schema views
real_tables_only AS (
SELECT
tables.*
FROM
tables
JOIN
`region-us`.INFORMATION_SCHEMA.SCHEMATA
ON
schema_name = dataset_id
),
-- Union the "real table" results and the information schema results
all_jobs AS (
SELECT
CONCAT(project_id, '.', dataset_id, '.', table_id) AS table_id
FROM
real_tables_only
UNION ALL
SELECT
CONCAT(project_id, '.', dataset_id, '.', table_id) AS table_id
FROM
info_schema_jobs )
-- Final result that does counting
SELECT
table_id,
COUNT(table_id) AS table_count
FROM
all_jobs
GROUP BY
table_id
ORDER BY
table_count ASC;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment