Skip to content

Instantly share code, notes, and snippets.

@struski
Last active January 22, 2018 11:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save struski/0b3fdd46d256af28d1dbdcb641c1fc34 to your computer and use it in GitHub Desktop.
Save struski/0b3fdd46d256af28d1dbdcb641c1fc34 to your computer and use it in GitHub Desktop.
#standardSQL
-- Create temporary function to calculate
-- acurracy of Common Air Quality Index calculation
-- There are six pollutants in total. If all pollutant readings
-- are available then accuracy is 100%.
-- The less pollutant readings there are the worse the CAQI calculation accuracy is.
-- This function takes in an array of sensor readings and returns single FLOAT64
CREATE TEMPORARY FUNCTION getCAQIAccuracy (
sensor_reading ARRAY<STRUCT<pollutant STRING, value FLOAT64>>
)
RETURNS FLOAT64
AS ((
SELECT (ARRAY_LENGTH(sensor_reading)/6)*100
));
WITH
-- This is first named subquery inside WITH clause
-- It returns sensor readings from the last 3 hours for a location
-- ARRAY_AGG will create an array that groups all sensor readings
LatestSensorReadings AS (
SELECT EXTRACT(DATE FROM timestamp) AS date, location, latitude, longitude,
city, country, ARRAY_AGG(STRUCT(pollutant, value)) AS sensor_reading
FROM `bigquery-public-data.openaq.global_air_quality`
WHERE averaged_over_in_hours=1.0
AND TIMESTAMP_DIFF(CURRENT_TIMESTAMP(), timestamp, HOUR)<=3
GROUP BY date, location, latitude, longitude, city, country),
-- Second named subquery in WITH clause starts here. Notice the comma ',' above
IndexCalculationGrid AS (
-- Create an array of STRUCT fields and initialize it
-- with index calculation grid values. FLOAT64 fields in the STRUCT represent
-- the lowest value that pollutant needs to have to get specified CAQI level.
-- Highest CAQI level for a location settles the final index value
SELECT *
FROM UNNEST(
ARRAY<STRUCT<
caqi_level INT64,
pm10 FLOAT64,
pm25 FLOAT64,
no2 FLOAT64,
o3 FLOAT64,
co FLOAT64,
so2 FLOAT64
>>[
-- VERY LOW
(0, 0, 0, 0, 0, 0, 0),
-- LOW
(1, 25, 15, 50, 60, 5000, 50),
-- MEDIUM
(2, 50, 30, 100, 120, 7500, 100),
-- HIGH
(3, 90, 55, 200, 180, 10000, 350),
-- VERY HIGH
(4, 180, 110, 400, 240, 20000, 500)
]
)),
-- Third named subquery in WITH clause
CAQIStatus AS (
-- Human readable CAQI levels as UNION of generated rows
SELECT 0 AS level, 'VERY LOW' AS status UNION ALL
SELECT 1 AS level, 'LOW' AS status UNION ALL
SELECT 2 AS level, 'MEDIUM' AS status UNION ALL
SELECT 3 AS level, 'HIGH' AS status UNION ALL
SELECT 4 AS level, 'VERY HIGH' AS status
)
SELECT date, location, latitude, longitude, city, country,
caqi, status, accuracy, sensor_reading
FROM (
-- Subquery inside FROM
SELECT lsr.date, lsr.location, lsr.latitude, lsr.longitude, lsr.city, lsr.country,
-- User Defined Function is called here
getCAQIAccuracy(sensor_reading) AS accuracy, lsr.sensor_reading,
( -- Subquery inside a SELECT statement. Single result of
-- this SELECT will be a column (caqi) in outer query
-- Find CAQI level for each pollutant inside sensor_reading array
-- Maximal CAQI level in each sensor_reading array determines the final index
-- ie. if PM10 level is VERY HIGH but all other pollutants are
-- at LOW then final index is VERY HIGH
SELECT MAX(caqi_level)
FROM IndexCalculationGrid
-- Below we are referencing column that originates from the outer query
JOIN UNNEST(lsr.sensor_reading)
WHERE
CASE WHEN pollutant='pm10' AND value > pm10 THEN true END
OR CASE WHEN pollutant='pm25' AND value > pm25 THEN true END
OR CASE WHEN pollutant='no2' AND value > no2 THEN true END
OR CASE WHEN pollutant='o3' AND value > o3 THEN true END
OR CASE WHEN pollutant='co' AND value > co THEN true END
OR CASE WHEN pollutant='so2' AND value > so2 THEN true END
) AS caqi
FROM LatestSensorReadings AS lsr
)
JOIN CAQIStatus ON (caqi=level)
-- caqi can be null if value of the reading is incorrect (eg. <= 0)
-- and there are no other correct readings in the set
-- We want to eliminate those results
WHERE caqi IS NOT NULL
-- We also want only the results that are 80% or more accurate
-- (ie. based on 5 or more pollutant readings)
AND accuracy > 80
ORDER BY caqi DESC, accuracy DESC
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment