Last active
January 22, 2018 11:54
-
-
Save struski/0b3fdd46d256af28d1dbdcb641c1fc34 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#standardSQL | |
-- Create temporary function to calculate | |
-- acurracy of Common Air Quality Index calculation | |
-- There are six pollutants in total. If all pollutant readings | |
-- are available then accuracy is 100%. | |
-- The less pollutant readings there are the worse the CAQI calculation accuracy is. | |
-- This function takes in an array of sensor readings and returns single FLOAT64 | |
CREATE TEMPORARY FUNCTION getCAQIAccuracy ( | |
sensor_reading ARRAY<STRUCT<pollutant STRING, value FLOAT64>> | |
) | |
RETURNS FLOAT64 | |
AS (( | |
SELECT (ARRAY_LENGTH(sensor_reading)/6)*100 | |
)); | |
WITH | |
-- This is first named subquery inside WITH clause | |
-- It returns sensor readings from the last 3 hours for a location | |
-- ARRAY_AGG will create an array that groups all sensor readings | |
LatestSensorReadings AS ( | |
SELECT EXTRACT(DATE FROM timestamp) AS date, location, latitude, longitude, | |
city, country, ARRAY_AGG(STRUCT(pollutant, value)) AS sensor_reading | |
FROM `bigquery-public-data.openaq.global_air_quality` | |
WHERE averaged_over_in_hours=1.0 | |
AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), timestamp, HOUR)<=3 | |
GROUP BY date, location, latitude, longitude, city, country), | |
-- Second named subquery in WITH clause starts here. Notice the comma ',' above | |
IndexCalculationGrid AS ( | |
-- Create an array of STRUCT fields and initialize it | |
-- with index calculation grid values. FLOAT64 fields in the STRUCT represent | |
-- the lowest value that pollutant needs to have to get specified CAQI level. | |
-- Highest CAQI level for a location settles the final index value | |
SELECT * | |
FROM UNNEST( | |
ARRAY<STRUCT< | |
caqi_level INT64, | |
pm10 FLOAT64, | |
pm25 FLOAT64, | |
no2 FLOAT64, | |
o3 FLOAT64, | |
co FLOAT64, | |
so2 FLOAT64 | |
>>[ | |
-- VERY LOW | |
(0, 0, 0, 0, 0, 0, 0), | |
-- LOW | |
(1, 25, 15, 50, 60, 5000, 50), | |
-- MEDIUM | |
(2, 50, 30, 100, 120, 7500, 100), | |
-- HIGH | |
(3, 90, 55, 200, 180, 10000, 350), | |
-- VERY HIGH | |
(4, 180, 110, 400, 240, 20000, 500) | |
] | |
)), | |
-- Third named subquery in WITH clause | |
CAQIStatus AS ( | |
-- Human readable CAQI levels as UNION of generated rows | |
SELECT 0 AS level, 'VERY LOW' AS status UNION ALL | |
SELECT 1 AS level, 'LOW' AS status UNION ALL | |
SELECT 2 AS level, 'MEDIUM' AS status UNION ALL | |
SELECT 3 AS level, 'HIGH' AS status UNION ALL | |
SELECT 4 AS level, 'VERY HIGH' AS status | |
) | |
SELECT date, location, latitude, longitude, city, country, | |
caqi, status, accuracy, sensor_reading | |
FROM ( | |
-- Subquery inside FROM | |
SELECT lsr.date, lsr.location, lsr.latitude, lsr.longitude, lsr.city, lsr.country, | |
-- User Defined Function is called here | |
getCAQIAccuracy(sensor_reading) AS accuracy, lsr.sensor_reading, | |
( -- Subquery inside a SELECT statement. Single result of | |
-- this SELECT will be a column (caqi) in outer query | |
-- Find CAQI level for each pollutant inside sensor_reading array | |
-- Maximal CAQI level in each sensor_reading array determines the final index | |
-- ie. if PM10 level is VERY HIGH but all other pollutants are | |
-- at LOW then final index is VERY HIGH | |
SELECT MAX(caqi_level) | |
FROM IndexCalculationGrid | |
-- Below we are referencing column that originates from the outer query | |
JOIN UNNEST(lsr.sensor_reading) | |
WHERE | |
CASE WHEN pollutant='pm10' AND value > pm10 THEN true END | |
OR CASE WHEN pollutant='pm25' AND value > pm25 THEN true END | |
OR CASE WHEN pollutant='no2' AND value > no2 THEN true END | |
OR CASE WHEN pollutant='o3' AND value > o3 THEN true END | |
OR CASE WHEN pollutant='co' AND value > co THEN true END | |
OR CASE WHEN pollutant='so2' AND value > so2 THEN true END | |
) AS caqi | |
FROM LatestSensorReadings AS lsr | |
) | |
JOIN CAQIStatus ON (caqi=level) | |
-- caqi can be null if value of the reading is incorrect (eg. <= 0) | |
-- and there are no other correct readings in the set | |
-- We want to eliminate those results | |
WHERE caqi IS NOT NULL | |
-- We also want only the results that are 80% or more accurate | |
-- (ie. based on 5 or more pollutant readings) | |
AND accuracy > 80 | |
ORDER BY caqi DESC, accuracy DESC |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment