struski/bigquery-air-pollution-caqi-index-openaq.sql

## bigquery-air-pollution-caqi-index-openaq.sql
#standardSQL
-- Create temporary function to calculate
-- acurracy of Common Air Quality Index calculation
-- There are six pollutants in total. If all pollutant readings
-- are available then accuracy is 100%.
-- The less pollutant readings there are the worse the CAQI calculation accuracy is.
-- This function takes in an array of sensor readings and returns single FLOAT64
CREATE TEMPORARY FUNCTION getCAQIAccuracy (
  sensor_reading ARRAY<STRUCT<pollutant STRING, value FLOAT64>>
)
RETURNS FLOAT64
AS ((
  SELECT (ARRAY_LENGTH(sensor_reading)/6)*100
));

WITH
-- This is first named subquery inside WITH clause
-- It returns sensor readings from the last 3 hours for a location
-- ARRAY_AGG will create an array that groups all sensor readings
LatestSensorReadings AS (
  SELECT EXTRACT(DATE FROM timestamp) AS date, location, latitude, longitude,
         city, country, ARRAY_AGG(STRUCT(pollutant, value)) AS sensor_reading
  FROM `bigquery-public-data.openaq.global_air_quality`
  WHERE averaged_over_in_hours=1.0
  AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), timestamp, HOUR)<=3
  GROUP BY date, location, latitude, longitude, city, country),
-- Second named subquery in WITH clause starts here. Notice the comma ',' above
IndexCalculationGrid AS (
-- Create an array of STRUCT fields and initialize it
-- with index calculation grid values. FLOAT64 fields in the STRUCT represent
-- the lowest value that pollutant needs to have to get specified CAQI level.
-- Highest CAQI level for a location settles the final index value
  SELECT *
  FROM UNNEST(
  ARRAY<STRUCT<
    caqi_level INT64,
    pm10 FLOAT64,
    pm25 FLOAT64,
    no2 FLOAT64,
    o3 FLOAT64,
    co FLOAT64,
    so2 FLOAT64
    >>[
      -- VERY LOW
      (0, 0, 0, 0, 0, 0, 0),
      -- LOW
      (1, 25, 15, 50, 60, 5000, 50),
      -- MEDIUM
      (2, 50, 30, 100, 120, 7500, 100),
      -- HIGH
      (3, 90, 55, 200, 180, 10000, 350),
      -- VERY HIGH
      (4, 180, 110, 400, 240, 20000, 500)
    ]
)),
-- Third named subquery in WITH clause
CAQIStatus AS (
  -- Human readable CAQI levels as UNION of generated rows
  SELECT 0 AS level, 'VERY LOW' AS status UNION ALL
  SELECT 1 AS level, 'LOW' AS status UNION ALL
  SELECT 2 AS level, 'MEDIUM' AS status UNION ALL
  SELECT 3 AS level, 'HIGH' AS status UNION ALL
  SELECT 4 AS level, 'VERY HIGH' AS status
)

SELECT date, location, latitude, longitude, city, country,
       caqi, status, accuracy, sensor_reading
FROM (
  -- Subquery inside FROM
  SELECT lsr.date, lsr.location, lsr.latitude, lsr.longitude, lsr.city, lsr.country,
	   -- User Defined Function is called here
         getCAQIAccuracy(sensor_reading) AS accuracy, lsr.sensor_reading,
     ( -- Subquery inside a SELECT statement. Single result of
       -- this SELECT will be a column (caqi) in outer query
       -- Find CAQI level for each pollutant inside sensor_reading array
       -- Maximal CAQI level in each sensor_reading array determines the final index
       -- ie. if PM10 level is VERY HIGH but all other pollutants are
       -- at LOW then final index is VERY HIGH
      SELECT MAX(caqi_level)
      FROM IndexCalculationGrid
      -- Below we are referencing column that originates from the outer query
      JOIN UNNEST(lsr.sensor_reading)
      WHERE
        CASE WHEN pollutant='pm10' AND value > pm10 THEN true END
        OR CASE WHEN pollutant='pm25' AND value > pm25 THEN true END
        OR CASE WHEN pollutant='no2' AND value > no2 THEN true END
        OR CASE WHEN pollutant='o3' AND value > o3 THEN true END
        OR CASE WHEN pollutant='co' AND value > co THEN true END
        OR CASE WHEN pollutant='so2' AND value > so2 THEN true END
     ) AS caqi
  FROM LatestSensorReadings AS lsr
)
JOIN CAQIStatus ON (caqi=level)
-- caqi can be null if value of the reading is incorrect (eg. <= 0)
-- and there are no other correct readings in the set
-- We want to eliminate those results
WHERE caqi IS NOT NULL
-- We also want only the results that are 80% or more accurate
-- (ie. based on 5 or more pollutant readings)
AND accuracy > 80
ORDER BY caqi DESC, accuracy DESC
	#standardSQL
	-- Create temporary function to calculate
	-- acurracy of Common Air Quality Index calculation
	-- There are six pollutants in total. If all pollutant readings
	-- are available then accuracy is 100%.
	-- The less pollutant readings there are the worse the CAQI calculation accuracy is.
	-- This function takes in an array of sensor readings and returns single FLOAT64
	CREATE TEMPORARY FUNCTION getCAQIAccuracy (
	sensor_reading ARRAY<STRUCT<pollutant STRING, value FLOAT64>>
	)
	RETURNS FLOAT64
	AS ((
	SELECT (ARRAY_LENGTH(sensor_reading)/6)*100
	));

	WITH
	-- This is first named subquery inside WITH clause
	-- It returns sensor readings from the last 3 hours for a location
	-- ARRAY_AGG will create an array that groups all sensor readings
	LatestSensorReadings AS (
	SELECT EXTRACT(DATE FROM timestamp) AS date, location, latitude, longitude,
	city, country, ARRAY_AGG(STRUCT(pollutant, value)) AS sensor_reading
	FROM `bigquery-public-data.openaq.global_air_quality`
	WHERE averaged_over_in_hours=1.0
	AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), timestamp, HOUR)<=3
	GROUP BY date, location, latitude, longitude, city, country),
	-- Second named subquery in WITH clause starts here. Notice the comma ',' above
	IndexCalculationGrid AS (
	-- Create an array of STRUCT fields and initialize it
	-- with index calculation grid values. FLOAT64 fields in the STRUCT represent
	-- the lowest value that pollutant needs to have to get specified CAQI level.
	-- Highest CAQI level for a location settles the final index value
	SELECT *
	FROM UNNEST(
	ARRAY<STRUCT<
	caqi_level INT64,
	pm10 FLOAT64,
	pm25 FLOAT64,
	no2 FLOAT64,
	o3 FLOAT64,
	co FLOAT64,
	so2 FLOAT64
	>>[
	-- VERY LOW
	(0, 0, 0, 0, 0, 0, 0),
	-- LOW
	(1, 25, 15, 50, 60, 5000, 50),
	-- MEDIUM
	(2, 50, 30, 100, 120, 7500, 100),
	-- HIGH
	(3, 90, 55, 200, 180, 10000, 350),
	-- VERY HIGH
	(4, 180, 110, 400, 240, 20000, 500)
	]
	)),
	-- Third named subquery in WITH clause
	CAQIStatus AS (
	-- Human readable CAQI levels as UNION of generated rows
	SELECT 0 AS level, 'VERY LOW' AS status UNION ALL
	SELECT 1 AS level, 'LOW' AS status UNION ALL
	SELECT 2 AS level, 'MEDIUM' AS status UNION ALL
	SELECT 3 AS level, 'HIGH' AS status UNION ALL
	SELECT 4 AS level, 'VERY HIGH' AS status
	)

	SELECT date, location, latitude, longitude, city, country,
	caqi, status, accuracy, sensor_reading
	FROM (
	-- Subquery inside FROM
	SELECT lsr.date, lsr.location, lsr.latitude, lsr.longitude, lsr.city, lsr.country,
	-- User Defined Function is called here
	getCAQIAccuracy(sensor_reading) AS accuracy, lsr.sensor_reading,
	( -- Subquery inside a SELECT statement. Single result of
	-- this SELECT will be a column (caqi) in outer query
	-- Find CAQI level for each pollutant inside sensor_reading array
	-- Maximal CAQI level in each sensor_reading array determines the final index
	-- ie. if PM10 level is VERY HIGH but all other pollutants are
	-- at LOW then final index is VERY HIGH
	SELECT MAX(caqi_level)
	FROM IndexCalculationGrid
	-- Below we are referencing column that originates from the outer query
	JOIN UNNEST(lsr.sensor_reading)
	WHERE
	CASE WHEN pollutant='pm10' AND value > pm10 THEN true END
	OR CASE WHEN pollutant='pm25' AND value > pm25 THEN true END
	OR CASE WHEN pollutant='no2' AND value > no2 THEN true END
	OR CASE WHEN pollutant='o3' AND value > o3 THEN true END
	OR CASE WHEN pollutant='co' AND value > co THEN true END
	OR CASE WHEN pollutant='so2' AND value > so2 THEN true END
	) AS caqi
	FROM LatestSensorReadings AS lsr
	)
	JOIN CAQIStatus ON (caqi=level)
	-- caqi can be null if value of the reading is incorrect (eg. <= 0)
	-- and there are no other correct readings in the set
	-- We want to eliminate those results
	WHERE caqi IS NOT NULL
	-- We also want only the results that are 80% or more accurate
	-- (ie. based on 5 or more pollutant readings)
	AND accuracy > 80
	ORDER BY caqi DESC, accuracy DESC