-- robrich/time-series.sql

-- time-series.sql
-- TIME SERIES
-- ===========

-- Schema setup: create a dedicated database and switch to it.
CREATE DATABASE temp_history;
USE temp_history;

-- Readings table; only location and read_date are mandatory.
CREATE TABLE temperatures (
    location     VARCHAR(200) NOT NULL,
    read_date    DATETIME(6)  NOT NULL,  -- timestamp with 6 fractional-second digits
    latitude     DOUBLE,
    longitude    DOUBLE,
    temperatureF DOUBLE,                 -- presumably degrees Fahrenheit (per the F suffix)
    -- clustered columnstore key on (read_date, location)
    KEY (read_date, location) USING CLUSTERED COLUMNSTORE
);

-- Load values: a filesystem (FS) pipeline that reads a tab-separated text file
-- into the temperatures table.
-- Thank you to https://www.ncdc.noaa.gov/ for 2010's data
CREATE PIPELINE temperatures AS
    LOAD DATA FS '/vagrant/noaa-weather-data.txt'
    INTO TABLE temperatures
    FORMAT CSV
    FIELDS TERMINATED BY '\t'
    LINES TERMINATED BY '\n'
    IGNORE 1 LINES;  -- skip the first line of the file (presumably a header row)

-- start pipeline
-- Step 1: dry run limited to 10 rows (NOTE(review): per SingleStore docs TEST PIPELINE
-- does not write to the target table -- confirm for the deployed version).
TEST PIPELINE temperatures LIMIT 10;
-- Step 2: run a single batch in the foreground, so this statement blocks until
-- that one batch has been processed.
START PIPELINE temperatures FOREGROUND LIMIT 1 BATCHES;
-- Step 3: start the pipeline without FOREGROUND (presumably as a background job).
START PIPELINE temperatures;

-- Verify the load: look at the rows, count them, inspect the pipeline batch
-- summary, and count rows per location.
SELECT location, read_date, latitude, longitude, temperatureF
FROM temperatures;

SELECT count(*)
FROM temperatures;

SELECT *
FROM information_schema.PIPELINES_BATCHES_SUMMARY;

SELECT location, count(*)
FROM temperatures
GROUP BY location;


-- Mean temperature per location across every loaded reading,
-- rounded to one decimal place.
SELECT
    location,
    ROUND(AVG(temperatureF), 1) AS 'avg temp F'
FROM temperatures
GROUP BY location;

-- Daily mean / min / max per location; read_date is cast to a calendar date
-- so all readings from the same day fall into one group.
SELECT
    location,
    read_date :> date,
    ROUND(AVG(temperatureF), 1) AS 'avg temp F',
    MIN(temperatureF),
    MAX(temperatureF)
FROM temperatures
GROUP BY 1, 2
ORDER BY 1, 2;

-- Average weekly temperature in March
-- Readings are grouped into 7-day buckets per location.
-- read_date is a DATETIME, so the range is half-open: an upper bound of
-- `<= '2010-03-31'` would stop at midnight and silently drop every reading
-- taken later on March 31.
SELECT
    location,
    TIME_BUCKET('7d', read_date) AS 'week',
    ROUND(AVG(temperatureF), 1) AS 'avg temp F',
    MIN(temperatureF),
    MAX(temperatureF)
FROM temperatures
WHERE read_date >= '2010-03-01'
  AND read_date <  '2010-04-01'
GROUP BY 1, 2
ORDER BY 1, 2;


-- Candlestick chart for July
-- For each location and 1-day bucket: lowest, highest, first and last reading.
WITH ranked AS (
    SELECT
        location,
        read_date,
        -- r = 1 marks the earliest reading in each (location, day) partition;
        -- readings that tie on read_date would both get rank 1.
        RANK() OVER w AS r,
        MIN(temperatureF) OVER w AS 'min',
        MAX(temperatureF) OVER w AS 'max',
        FIRST_VALUE(temperatureF) OVER w AS 'first',
        LAST_VALUE(temperatureF) OVER w AS 'last'
    FROM temperatures
    WINDOW w AS (
        PARTITION BY location, time_bucket('1d', read_date)
        ORDER BY read_date
        -- whole-partition frame, so every row sees the full day's values
        ROWS BETWEEN UNBOUNDED PRECEDING
        AND UNBOUNDED FOLLOWING
    )
)
SELECT
    location,
    time_bucket('1d', read_date) AS 'date',
    min, max, first, last
FROM ranked
WHERE r = 1
  -- Half-open range: this filter sees the day's first reading, so an upper
  -- bound of `<= '2010-07-31'` would drop the July 31 candle unless that
  -- reading happened exactly at midnight.
  AND read_date >= '2010-07-01'
  AND read_date <  '2010-08-01'
ORDER BY 1, 2;


-- smoothing: average the 3 preceding rows into the current row
-- PARTITION BY location keeps the moving average inside one location; ordering
-- a single window by (location, read_date) would blend the last readings of
-- one location into the first readings of the next.
-- The date range is half-open because read_date is a DATETIME: an upper bound
-- of `<= '2010-03-31'` would drop readings taken after midnight on March 31.
SELECT
    location,
    read_date,
    temperatureF,
    AVG(temperatureF) OVER (
        PARTITION BY location
        ORDER BY read_date
        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
    ) AS smoothed_temp
FROM temperatures
WHERE read_date >= '2010-03-01'
  AND read_date <  '2010-04-01'
ORDER BY 1, 2;


-- Cleanup
-- Tear down everything this script created, in order: stop the pipeline
-- before dropping it, then drop the table it loads into, then the database.
STOP PIPELINE temperatures;
DROP PIPELINE temperatures;
DROP TABLE temperatures;
DROP DATABASE temp_history;
	-- TIME SERIES
	-- ===========

	-- Schema setup: create a dedicated database and switch to it.
	CREATE DATABASE temp_history;
	USE temp_history;

	-- Readings table; only location and read_date are mandatory.
	CREATE TABLE temperatures (
	    location     VARCHAR(200) NOT NULL,
	    read_date    DATETIME(6)  NOT NULL,  -- timestamp with 6 fractional-second digits
	    latitude     DOUBLE,
	    longitude    DOUBLE,
	    temperatureF DOUBLE,                 -- presumably degrees Fahrenheit (per the F suffix)
	    -- clustered columnstore key on (read_date, location)
	    KEY (read_date, location) USING CLUSTERED COLUMNSTORE
	);

	-- Load values: a filesystem (FS) pipeline that reads a tab-separated text file
	-- into the temperatures table.
	-- Thank you to https://www.ncdc.noaa.gov/ for 2010's data
	CREATE PIPELINE temperatures AS
	    LOAD DATA FS '/vagrant/noaa-weather-data.txt'
	    INTO TABLE temperatures
	    FORMAT CSV
	    FIELDS TERMINATED BY '\t'
	    LINES TERMINATED BY '\n'
	    IGNORE 1 LINES;  -- skip the first line of the file (presumably a header row)

	-- start pipeline
	-- Step 1: dry run limited to 10 rows (NOTE(review): per SingleStore docs TEST PIPELINE
	-- does not write to the target table -- confirm for the deployed version).
	TEST PIPELINE temperatures LIMIT 10;
	-- Step 2: run a single batch in the foreground, so this statement blocks until
	-- that one batch has been processed.
	START PIPELINE temperatures FOREGROUND LIMIT 1 BATCHES;
	-- Step 3: start the pipeline without FOREGROUND (presumably as a background job).
	START PIPELINE temperatures;

	-- Verify the load: look at the rows, count them, inspect the pipeline batch
	-- summary, and count rows per location.
	SELECT location, read_date, latitude, longitude, temperatureF
	FROM temperatures;

	SELECT count(*)
	FROM temperatures;

	SELECT *
	FROM information_schema.PIPELINES_BATCHES_SUMMARY;

	SELECT location, count(*)
	FROM temperatures
	GROUP BY location;


	-- Mean temperature per location across every loaded reading,
	-- rounded to one decimal place.
	SELECT
	    location,
	    ROUND(AVG(temperatureF), 1) AS 'avg temp F'
	FROM temperatures
	GROUP BY location;

	-- Daily mean / min / max per location; read_date is cast to a calendar date
	-- so all readings from the same day fall into one group.
	SELECT
	    location,
	    read_date :> date,
	    ROUND(AVG(temperatureF), 1) AS 'avg temp F',
	    MIN(temperatureF),
	    MAX(temperatureF)
	FROM temperatures
	GROUP BY 1, 2
	ORDER BY 1, 2;

	-- Average weekly temperature in March
	-- Readings are grouped into 7-day buckets per location.
	-- read_date is a DATETIME, so the range is half-open: an upper bound of
	-- `<= '2010-03-31'` would stop at midnight and silently drop every reading
	-- taken later on March 31.
	SELECT
	    location,
	    TIME_BUCKET('7d', read_date) AS 'week',
	    ROUND(AVG(temperatureF), 1) AS 'avg temp F',
	    MIN(temperatureF),
	    MAX(temperatureF)
	FROM temperatures
	WHERE read_date >= '2010-03-01'
	  AND read_date <  '2010-04-01'
	GROUP BY 1, 2
	ORDER BY 1, 2;


	-- Candlestick chart for July
	-- For each location and 1-day bucket: lowest, highest, first and last reading.
	WITH ranked AS (
	    SELECT
	        location,
	        read_date,
	        -- r = 1 marks the earliest reading in each (location, day) partition;
	        -- readings that tie on read_date would both get rank 1.
	        RANK() OVER w AS r,
	        MIN(temperatureF) OVER w AS 'min',
	        MAX(temperatureF) OVER w AS 'max',
	        FIRST_VALUE(temperatureF) OVER w AS 'first',
	        LAST_VALUE(temperatureF) OVER w AS 'last'
	    FROM temperatures
	    WINDOW w AS (
	        PARTITION BY location, time_bucket('1d', read_date)
	        ORDER BY read_date
	        -- whole-partition frame, so every row sees the full day's values
	        ROWS BETWEEN UNBOUNDED PRECEDING
	        AND UNBOUNDED FOLLOWING
	    )
	)
	SELECT
	    location,
	    time_bucket('1d', read_date) AS 'date',
	    min, max, first, last
	FROM ranked
	WHERE r = 1
	  -- Half-open range: this filter sees the day's first reading, so an upper
	  -- bound of `<= '2010-07-31'` would drop the July 31 candle unless that
	  -- reading happened exactly at midnight.
	  AND read_date >= '2010-07-01'
	  AND read_date <  '2010-08-01'
	ORDER BY 1, 2;


	-- smoothing: average the 3 preceding rows into the current row
	-- PARTITION BY location keeps the moving average inside one location; ordering
	-- a single window by (location, read_date) would blend the last readings of
	-- one location into the first readings of the next.
	-- The date range is half-open because read_date is a DATETIME: an upper bound
	-- of `<= '2010-03-31'` would drop readings taken after midnight on March 31.
	SELECT
	    location,
	    read_date,
	    temperatureF,
	    AVG(temperatureF) OVER (
	        PARTITION BY location
	        ORDER BY read_date
	        ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
	    ) AS smoothed_temp
	FROM temperatures
	WHERE read_date >= '2010-03-01'
	  AND read_date <  '2010-04-01'
	ORDER BY 1, 2;


	-- Cleanup
	-- Tear down everything this script created, in order: stop the pipeline
	-- before dropping it, then drop the table it loads into, then the database.
	STOP PIPELINE temperatures;
	DROP PIPELINE temperatures;
	DROP TABLE temperatures;
	DROP DATABASE temp_history;