@williamtsoi1
Created May 3, 2023 03:13
BigQuery Slot Utilization (By Organization)
-- This script analyzes your GCP organization's BigQuery usage and reports slot utilization as minutes spent in 100-slot buckets.
-- The data covers roughly the last month (the 730 hours prior to end_date).
-- Note that this script does not gather slot utilization from existing BigQuery reservations; it covers on-demand slot utilization only.
--
-- Required permissions:
-- You need the bigquery.jobs.listAll permission on the GCP organization in order to run this query.
-- Details here: https://cloud.google.com/bigquery/docs/information-schema-jobs-timeline-by-organization#required_permissions
--
-- Instructions:
-- 1. Set the end_date variable to the end of the month-long period that you wish to analyze.
-- 2. Change the table `region-US`.INFORMATION_SCHEMA.JOBS_TIMELINE_BY_ORGANIZATION to match the region that you wish to analyze.
--    For example, for the Hong Kong region (asia-east2) the table should be `region-asia-east2`.INFORMATION_SCHEMA.JOBS_TIMELINE_BY_ORGANIZATION
--
-- How to interpret the results:
-- The query will return results in this format
-- +--------+---------+
-- | bucket | minutes |
-- +--------+---------+
-- |  100.0 |   25810 |
-- |  200.0 |    7263 |
-- |  300.0 |    3397 |
-- |  400.0 |    2919 |
-- |    ... |     ... |
-- +--------+---------+
--
-- This result means that out of the previous 730 hours (43,800 minutes in total), the organization used 0-100 slots in 25,810 of those minutes,
-- 100-200 slots in 7,263 of those minutes, and so on.
DECLARE end_date TIMESTAMP;
SET end_date = TIMESTAMP("2023-05-03");
WITH
snapshot_data AS (
SELECT
period_start,
period_slot_ms
FROM
`region-US`.INFORMATION_SCHEMA.JOBS_TIMELINE_BY_ORGANIZATION
WHERE
period_start BETWEEN TIMESTAMP_SUB(end_date, INTERVAL 730 HOUR)
AND end_date
AND (statement_type != "SCRIPT"
OR statement_type IS NULL)
AND period_slot_ms > 0
-- for on demand usage
AND reservation_id IS NULL AND job_type = "QUERY"
),
data_by_time AS (
SELECT
TIMESTAMP_TRUNC(period_start, SECOND) AS usage_time_sec,
IFNULL(SUM(period_slot_ms) / 1000, 0) AS usage_slot_sec
FROM
snapshot_data
GROUP BY
usage_time_sec),
max_per_minute_data_by_time AS (
SELECT
TIMESTAMP_TRUNC(usage_time_sec, MINUTE) AS usage_time,
MAX(usage_slot_sec) AS max_slot_per_minute
FROM
data_by_time
GROUP BY
usage_time ),
max_per_minute_data_by_time_with_zeros AS (
SELECT
usage_time,
IFNULL(slot_usage.max_slot_per_minute,0) AS max_slot_per_minute
FROM
UNNEST(GENERATE_TIMESTAMP_ARRAY(TIMESTAMP_SUB(end_date, INTERVAL 730 HOUR), TIMESTAMP_SUB(end_date, INTERVAL 1 MINUTE), INTERVAL 1 MINUTE)) AS usage_time
LEFT JOIN
max_per_minute_data_by_time AS slot_usage
USING
(usage_time)),
all_buckets AS (
SELECT
GENERATE_ARRAY(0, MAX(max_slot_per_minute) + 100, 100) AS bucket
FROM
max_per_minute_data_by_time )
SELECT
bucket,
COUNTIF(max_slot_per_minute <= bucket
AND max_slot_per_minute > bucket - 100) AS minutes
FROM
max_per_minute_data_by_time_with_zeros,
UNNEST((
SELECT
bucket
FROM
all_buckets)) AS bucket
GROUP BY
bucket
ORDER BY
bucket
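The final bucketing step above can be sketched in Python (a minimal sketch, not part of the script; the function names and the 100-slot bucket width are illustrative and mirror the COUNTIF condition in the query):

```python
import math

# Bucket width assumed to match the 100-slot buckets in the query.
BUCKET_WIDTH = 100


def bucket_for(max_slots_per_minute: float) -> float:
    """Return the bucket a minute falls into, mirroring
    COUNTIF(max_slot_per_minute <= bucket AND max_slot_per_minute > bucket - 100)."""
    if max_slots_per_minute <= 0:
        return 0.0  # idle minutes land in bucket 0
    return math.ceil(max_slots_per_minute / BUCKET_WIDTH) * float(BUCKET_WIDTH)


def bucket_minutes(per_minute_max_slots):
    """Count minutes per bucket, like the final GROUP BY in the query."""
    counts = {}
    for v in per_minute_max_slots:
        b = bucket_for(v)
        counts[b] = counts.get(b, 0) + 1
    return counts
```

For example, a minute whose peak usage was 150 slots falls in the 200.0 bucket (it used between 100 and 200 slots), while a minute with exactly 100 slots falls in the 100.0 bucket.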