Skip to content

Instantly share code, notes, and snippets.

@robrich
Created October 12, 2020 19:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save robrich/462c31f5861628be8b571c8d5ee4726c to your computer and use it in GitHub Desktop.
Save robrich/462c31f5861628be8b571c8d5ee4726c to your computer and use it in GitHub Desktop.
time-series.sql
-- TIME SERIES
-- ===========
-- setup schema
CREATE DATABASE temp_history;
USE temp_history;
CREATE TABLE temperatures (
location VARCHAR(200) NOT NULL,
read_date DATETIME(6) NOT NULL,
latitude DOUBLE,
longitude DOUBLE,
temperatureF DOUBLE,
KEY (read_date, location) USING CLUSTERED COLUMNSTORE
);
-- load values
-- Thank you to https://www.ncdc.noaa.gov/ for 2010's data
CREATE PIPELINE temperatures
AS LOAD DATA FS '/vagrant/noaa-weather-data.txt'
INTO TABLE temperatures
FORMAT CSV
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
IGNORE 1 LINES;
-- start pipeline
TEST PIPELINE temperatures LIMIT 10;
START PIPELINE temperatures FOREGROUND LIMIT 1 BATCHES;
START PIPELINE temperatures;
-- verify pipeline
SELECT * FROM temperatures;
SELECT count(*) FROM temperatures;
SELECT * FROM information_schema.PIPELINES_BATCHES_SUMMARY;
SELECT location, count(*) FROM temperatures GROUP BY 1;
-- Average temperature all year
SELECT location, ROUND(AVG(temperatureF),1) as 'avg temp F' FROM temperatures GROUP BY location;
-- Average temperature by day
SELECT location, read_date :> date,
ROUND(AVG(temperatureF), 1) as 'avg temp F', MIN(temperatureF), MAX(temperatureF)
FROM temperatures
GROUP by 1, 2
ORDER BY 1, 2;
-- Average weekly temperature in March
SELECT location, TIME_BUCKET("7d", read_date) as 'week',
ROUND(AVG(temperatureF), 1) as 'avg temp F', MIN(temperatureF), MAX(temperatureF)
FROM temperatures
WHERE read_date >= '2010-03-01' AND read_date <= '2010-03-31'
GROUP BY 1, 2 ORDER BY 1, 2;
-- Candlestick chart for July
WITH ranked AS (
SELECT location, read_date,
RANK() OVER w as r,
MIN(temperatureF) over w as 'min',
MAX(temperatureF) over w as 'max',
FIRST_VALUE(temperatureF) over w as 'first',
LAST_VALUE(temperatureF) over w as 'last'
FROM temperatures
WINDOW w AS (
PARTITION BY location, time_bucket('1d', read_date)
ORDER BY read_date
ROWS BETWEEN UNBOUNDED PRECEDING
AND UNBOUNDED FOLLOWING
)
)
SELECT location, time_bucket('1d', read_date) as 'date',
min, max, first, last
FROM ranked
WHERE r = 1
AND (read_date >= '2010-07-01' AND read_date <= '2010-07-31')
ORDER BY 1, 2;
-- smoothing: average 3 preceeding rows into current row
SELECT location, read_date, temperatureF,
AVG(temperatureF) OVER (ORDER BY location, read_date ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS smoothed_temp
FROM temperatures
WHERE (read_date >= '2010-03-01' AND read_date <= '2010-03-31')
ORDER BY 1, 2;
-- Cleanup
STOP PIPELINE temperatures;
DROP PIPELINE temperatures;
DROP TABLE temperatures;
DROP DATABASE temp_history;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment