@sayle-doit
Last active June 16, 2023 17:55
Determine BigQuery Storage Costs Across an Organization for Both Compressed (Physical) and Uncompressed (Logical) Storage
/*
* This query runs across an entire organization, looking at tables in every project,
* and shows how they compare on compressed and uncompressed storage.
*
* Region Notes:
* This query will only read from a single region or multi-region at a time. It's
* currently not possible to read this data across all regions in a single query.
*
* By default this reads from the US multi-region, so this might need to be changed if
* your data lives elsewhere.
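*
* As a minimal illustration (assuming, say, your data actually lives in the EU
* multi-region), every `region-us` qualifier in the SQL below would be swapped out, e.g.:
*   FROM `region-eu`.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION AS tsbo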
*
* IAM Notes:
* This query will be reading multiple projects' and datasets' INFORMATION_SCHEMA views
* across the organization (see below notes) so you will need to have global BigQuery
* data read or admin rights across the entire organization for this to run successfully.
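*
* As a hedged example (the exact role is an assumption on my part; a broader BigQuery
* viewer or admin role at the organization level also works), granting metadata read
* access across the org might look like:
*   gcloud organizations add-iam-policy-binding ORGANIZATION_ID \
*     --member="user:you@example.com" --role="roles/bigquery.metadataViewer"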
*
* READ THIS BEFORE RUNNING:
* First and foremost if you have a large organization with many projects and datasets then
* this query may take a long time to run. Along those same lines if you are running on-demand
* this may scan a LOT of data and cost quite a bit of money without a way to calculate the cost
* (more on why below). So I recommend NOT running this on a larger organization with an
* on-demand project. Conversely, if you have a small flat-rate or Editions reservation then
* this query may take a while to run or be very slot intensive, so scaling up flex slots
* (if available) before running this and scaling back down after completion is a good idea.
* If running Editions then I would HIGHLY recommend setting a max on the autoscaler as this
* query is almost guaranteed to cause the autoscaler to scale up crazily for a time.
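*
* As a rough sketch only (the reservation path and slot number below are placeholders,
* and the exact option name should be verified against the BigQuery reservations DDL
* documentation), capping the autoscaler might look something like:
*   ALTER RESERVATION `admin-project.region-us.my-reservation`
*     SET OPTIONS (autoscale_max_slots = 200);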
*
* Note that currently BigQuery does not have a "TABLES_BY_ORGANIZATION" view in the
* INFORMATION_SCHEMA so getting the "real" (read as non-temporary, non-result, and non-external)
* table metadata requires reading the TABLES view in each individual project. Due to this
* the below query has to use dynamic queries in order to read this data as this is the
* only way to do this until Google implements a method to read all tables across the
* entire organization.
*
* This means that queries are dynamically generated by the SQL code and then run against
* every project in the organization. As a result, there is no way to estimate the
* bytes processed by this query, and it carries the risk of running generated code. Due to
* this I HIGHLY recommend reading the below code before running this so you can understand
* what is being run against your datasets and to know this code is safe. This also means that
* the BigQuery Autoscaler (if used) will not be able to predict the execution of this query
* very well and will scale up to the max slot values till it figures itself out.
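*
* For illustration, each generated per-project query has roughly this shape (the
* project, dataset, and table names here are placeholders):
*   SELECT table_catalog AS project_id, table_schema AS dataset_id, table_name
*   FROM `some-project`.`region-us`.INFORMATION_SCHEMA.TABLES AS t
*   WHERE TABLE_SCHEMA IN ('some_dataset') AND TABLE_NAME IN ('some_table')
* and the FOR loop below stitches them together with UNION ALL.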
*
* While I am strongly against using dynamically generated code inside of SQL, as well as
* using loops, in this case I unfortunately could not come up with another way to do this.
* So I took some liberties in this code that I usually wouldn't, which is why it is HIGHLY
* imperative to read this code before executing it so you know exactly what it's doing as
* it's not something most SQL coders would do on a regular basis.
*
* Output Notes:
* This will spit out quite a few result sets. The one that matters is the per-table result
* set, which starts with: "SELECT tsbo.project_id, table_schema AS dataset_id"
* Just hit "View Results" in the BQ UI on it to see your results.
*/
--
-- Note these variables are used to build up the dynamic query strings below.
DECLARE union_query STRING DEFAULT "SELECT project_id, dataset_id, table_name FROM (";
DECLARE counter INT64 DEFAULT 0;
DECLARE external_table_query STRING DEFAULT "CREATE TEMPORARY TABLE tables AS (";
-- This builds a table of queries, one string per project mentioned in TABLE_STORAGE_BY_ORGANIZATION
-- Must use a temporary table here as CTEs cannot be used with for loops apparently
CREATE OR REPLACE TEMPORARY TABLE queries AS
SELECT CONCAT("SELECT table_catalog AS project_id, table_schema AS dataset_id, table_name FROM `",
PROJECT_ID,
"`.`region-us`.INFORMATION_SCHEMA.TABLES AS t WHERE ",
"TABLE_SCHEMA IN (",
STRING_AGG(CONCAT("'", TABLE_SCHEMA, "'")), ')',
" AND TABLE_NAME IN (",
STRING_AGG(CONCAT("'", TABLE_NAME, "'")), ')') AS query
FROM
`region-us`.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION
WHERE table_type <> 'EXTERNAL' AND deleted = FALSE
GROUP BY PROJECT_ID;
--SELECT * FROM queries;
-- Run over every query built above building up a "UNION ALL" statement for them
FOR query_string IN (SELECT query FROM queries)
DO
-- If not the first query then add UNION ALL onto the query
IF counter > 0 THEN
SET union_query = (SELECT CONCAT(union_query, " UNION ALL ", query_string.query));
ELSE
SET union_query = (SELECT CONCAT(union_query, query_string.query));
END IF;
-- Increment the counter
SET counter = counter + 1;
END FOR;
-- Add the closing ) characters
SET union_query = (SELECT CONCAT(union_query, "))"));
-- Creates a temporary table containing all "real" tables from the organization's datasets
EXECUTE IMMEDIATE CONCAT(external_table_query, union_query);
/*
* At this point there is a list of all non-temporary, non-external, and non-result tables
* that will be joined against the storage_by_organization view for the final list of tables,
* sizes, and prices.
*
* This is the final result query to grab results from.
*/
SELECT
tsbo.project_id,
table_schema AS dataset_id,
tsbo.table_name,
-- Uncompressed bytes
total_logical_bytes AS total_uncompressed_bytes,
total_logical_bytes/POW(1024, 3) AS total_uncompressed_gibytes,
total_logical_bytes/POW(1024, 4) AS total_uncompressed_tibytes,
active_logical_bytes AS active_uncompressed_bytes,
active_logical_bytes/POW(1024, 3) AS active_uncompressed_gibytes,
active_logical_bytes/POW(1024, 4) AS active_uncompressed_tibytes,
long_term_logical_bytes AS long_term_uncompressed_bytes,
long_term_logical_bytes/POW(1024, 3) AS long_term_uncompressed_gibytes,
long_term_logical_bytes/POW(1024, 4) AS long_term_uncompressed_tibytes,
-- Compressed bytes
total_physical_bytes AS total_compressed_bytes,
total_physical_bytes/POW(1024, 3) AS total_compressed_gibytes,
total_physical_bytes/POW(1024, 4) AS total_compressed_tibytes,
-- Compression ratios
SAFE_DIVIDE(total_logical_bytes, total_physical_bytes) AS total_compression_ratio, -- Defined as uncompressed size/compressed size
SAFE_DIVIDE(long_term_logical_bytes, long_term_physical_bytes) AS long_term_compression_ratio,
SAFE_DIVIDE(active_logical_bytes, active_physical_bytes) AS active_compression_ratio,
-- Pricing (US multi-region list prices: $0.02/GiB active logical, $0.01/GiB long-term logical,
-- $0.04/GiB active physical, $0.02/GiB long-term physical; adjust for your region)
ROUND(((active_logical_bytes/POW(1024, 3))*0.02) +
((long_term_logical_bytes/POW(1024, 3))*0.01), 2) AS total_uncompressed_price,
ROUND(((active_logical_bytes/POW(1024, 3))*0.02), 2) AS active_uncompressed_price,
ROUND(((long_term_logical_bytes/POW(1024, 3))*0.01), 2) AS long_term_uncompressed_price,
ROUND((((active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3))*0.04) +
((long_term_physical_bytes/POW(1024, 3))*0.02), 2) AS total_compressed_price,
ROUND((((active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3))*0.04), 2) AS active_compressed_price,
ROUND((long_term_physical_bytes/POW(1024, 3))*0.02, 2) AS long_term_compressed_price,
ROUND((time_travel_physical_bytes/POW(1024, 3))*0.04, 2) AS time_travel_compressed_price,
ROUND((fail_safe_physical_bytes/POW(1024, 3))*0.04, 2) AS fail_safe_compressed_price
FROM
`region-us`.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION AS tsbo
JOIN
tables AS t ON
t.project_id = tsbo.project_id AND
t.dataset_id = tsbo.table_schema AND
t.table_name = tsbo.table_name;
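/*
 * Optional follow-up (a sketch, not part of the original output): the same join can be
 * rolled up per project to compare what each project's storage would cost under logical
 * (uncompressed) vs. physical (compressed) billing. It reuses the temporary `tables`
 * table and the same US multi-region list prices as above. If appended, it simply shows
 * up as one more result set after the per-table query described in the header notes.
 */
SELECT
  tsbo.project_id,
  ROUND(SUM((active_logical_bytes/POW(1024, 3))*0.02 +
            (long_term_logical_bytes/POW(1024, 3))*0.01), 2) AS uncompressed_price,
  ROUND(SUM(((active_physical_bytes-time_travel_physical_bytes)/POW(1024, 3))*0.04 +
            (long_term_physical_bytes/POW(1024, 3))*0.02), 2) AS compressed_price
FROM
  `region-us`.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION AS tsbo
JOIN
  tables AS t ON
    t.project_id = tsbo.project_id AND
    t.dataset_id = tsbo.table_schema AND
    t.table_name = tsbo.table_name
GROUP BY tsbo.project_id;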