Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#standardSQL
-- Create temporary function to calculate
-- acurracy of Common Air Quality Index calculation
-- There are six pollutants in total. If all pollutant readings
-- are available then accuracy is 100%.
-- The less pollutant readings there are the worse the CAQI calculation accuracy is.
-- This function takes in an array of sensor readings and returns single FLOAT64
CREATE TEMPORARY FUNCTION getCAQIAccuracy (
sensor_reading ARRAY<STRUCT<pollutant STRING, value FLOAT64>>
)
RETURNS FLOAT64
AS ((
SELECT (ARRAY_LENGTH(sensor_reading)/6)*100
));
WITH
-- This is first named subquery inside WITH clause
-- It returns sensor readings from the last 3 hours for a location
-- ARRAY_AGG will create an array that groups all sensor readings
LatestSensorReadings AS (
SELECT EXTRACT(DATE FROM timestamp) AS date, location, latitude, longitude,
city, country, ARRAY_AGG(STRUCT(pollutant, value)) AS sensor_reading
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE averaged_over_in_hours=1.0
AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), timestamp, HOUR)<=3
GROUP BY date, location, latitude, longitude, city, country),
-- Second named subquery in WITH clause starts here. Notice the comma ',' above
IndexCalculationGrid AS (
-- Create an array of STRUCT fields and initialize it
-- with index calculation grid values. FLOAT64 fields in the STRUCT represent
-- the lowest value that pollutant needs to have to get specified CAQI level.
-- Highest CAQI level for a location settles the final index value
SELECT *
FROM UNNEST(
ARRAY<STRUCT<
caqi_level INT64,
pm10 FLOAT64,
pm25 FLOAT64,
no2 FLOAT64,
o3 FLOAT64,
co FLOAT64,
so2 FLOAT64
>>[
-- VERY LOW
(0, 0, 0, 0, 0, 0, 0),
-- LOW
(1, 25, 15, 50, 60, 5000, 50),
-- MEDIUM
(2, 50, 30, 100, 120, 7500, 100),
-- HIGH
(3, 90, 55, 200, 180, 10000, 350),
-- VERY HIGH
(4, 180, 110, 400, 240, 20000, 500)
]
)),
-- Third named subquery in WITH clause
CAQIStatus AS (
-- Human readable CAQI levels as UNION of generated rows
SELECT 0 AS level, 'VERY LOW' AS status UNION ALL
SELECT 1 AS level, 'LOW' AS status UNION ALL
SELECT 2 AS level, 'MEDIUM' AS status UNION ALL
SELECT 3 AS level, 'HIGH' AS status UNION ALL
SELECT 4 AS level, 'VERY HIGH' AS status
)
SELECT date, location, latitude, longitude, city, country,
caqi, status, accuracy, sensor_reading
FROM (
-- Subquery inside FROM
SELECT lsr.date, lsr.location, lsr.latitude, lsr.longitude, lsr.city, lsr.country,
-- User Defined Function is called here
getCAQIAccuracy(sensor_reading) AS accuracy, lsr.sensor_reading,
( -- Subquery inside a SELECT statement. Single result of
-- this SELECT will be a column (caqi) in outer query
-- Find CAQI level for each pollutant inside sensor_reading array
-- Maximal CAQI level in each sensor_reading array determines the final index
-- ie. if PM10 level is VERY HIGH but all other pollutants are
-- at LOW then final index is VERY HIGH
SELECT MAX(caqi_level)
FROM IndexCalculationGrid
-- Below we are referencing column that originates from the outer query
JOIN UNNEST(lsr.sensor_reading)
WHERE
CASE WHEN pollutant='pm10' AND value > pm10 THEN true END
OR CASE WHEN pollutant='pm25' AND value > pm25 THEN true END
OR CASE WHEN pollutant='no2' AND value > no2 THEN true END
OR CASE WHEN pollutant='o3' AND value > o3 THEN true END
OR CASE WHEN pollutant='co' AND value > co THEN true END
OR CASE WHEN pollutant='so2' AND value > so2 THEN true END
) AS caqi
FROM LatestSensorReadings AS lsr
)
JOIN CAQIStatus ON (caqi=level)
-- caqi can be null if value of the reading is incorrect (eg. <= 0)
-- and there are no other correct readings in the set
-- We want to eliminate those results
WHERE caqi IS NOT NULL
-- We also want only the results that are 80% or more accurate
-- (ie. based on 5 or more pollutant readings)
AND accuracy > 80
ORDER BY caqi DESC, accuracy DESC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.