Skip to content

Instantly share code, notes, and snippets.

@bitner
Last active February 23, 2022 21:44
Show Gist options
  • Save bitner/8c4b2ef89eae70b425951a3d7986be71 to your computer and use it in GitHub Desktop.
Save bitner/8c4b2ef89eae70b425951a3d7986be71 to your computer and use it in GitHub Desktop.
pg_funstuff_for_tammo.sql
-- Make sure that you do all of this in your own schema
-- Postgres has a search path where it is always going to create new things (tables, functions, etc) in the first entry that exists in your search_path
SHOW search_path;
-- pgstac lives in the pgstac schema, to use pgstac without needing to prefix everything
-- you always want to have public in your search_path as that is where all the main things in the database are (including postgis - think of it like /usr/bin)
SET SEARCH_PATH to pgstac, public;
-- Create your own schema to play with these things
CREATE SCHEMA playground;
-- Add that schema to your path
SET SEARCH_PATH TO playground, pgstac, public;
-- This change is ONLY going to stick around for the current session
-- You can change the default search_path for a database role by altering the role
ALTER ROLE <myrole> SET SEARCH_PATH TO playground, pgstac, public;
-- SQL Functions documentation - https://www.postgresql.org/docs/13/xfunc-sql.html
-- SQL Function that wraps st_squaretile to return the tiles that a geometry intersects
-- https://postgis.net/docs/ST_SquareGrid.html
CREATE FUNCTION gridcells
(_geom geometry)
RETURNS SETOF geometry -- the "SETOF" says that this query can return multiple results as rows
AS $$
SELECT geom FROM st_squaregrid(5, _geom) WHERE st_intersects(geom, _geom);
$$ LANGUAGE SQL IMMUTABLE; -- the "IMMUTABLE" says that this query will always return the same results given the same arguments
-- SQL Function that does not return anything, but is shorthand to conveniently do something with variables
CREATE TABLE collection_grid_month(
collection_id text,
month timestamptz,
grid geometry,
count int
UNIQUE (collection_id, month, grid)
);
CREATE OR REPLACE FUNCTION update_aggregate(_collection text, _month timestamptz) RETURNS VOID AS $$
WITH newrows AS (
SELECT
id,
datetime,
collection_id,
gridcells(geometry) -- use the set returning function we created above
FROM items
WHERE datetime >= date_trunc('month', _month) AND datetime < (date_trunc('month', _month) + '1 month'::interval) AND collection_id=_collection
), results AS (
SELECT
date_trunc('month', datetime),
collection_id,
geom,
count(*)
FROM t GROUP BY 1,2,3
) INSERT INTO collection_grid_month (month, collection_id, geom, count)
SELECT * FROM results
ON CONFLICT (collection_id, month, grid) -- this has to match the unique constraint that we put on the table
DO UPDATE SET count=EXCLUDED.count WHERE count != EXCLUDED.count; -- make sure that we update records that are already in our table but do not update records that haven't changed
$$ LANGUAGE SQL;
-- now we can update a month/collection by just calling this function
SELECT update_aggregate('spire','2020-01-01');
-- if we want to bulkload things we want to make sure that we use a transaction for each month so that if something bails we don't lose everything
-- \gexec is a command in the psql client that will execute row-by-row the queries that are formed by the previous command https://www.postgresql.org/docs/13/app-psql.html
-- format is a function that can format a string using variables in pg https://www.postgresql.org/docs/13/functions-string.html#FUNCTIONS-STRING-FORMAT
-- you can use $$ to start and end a string so that you don't have to worry about quote escaping
-- in format, %s denotes a string as is %L will make sure to escape a string with quotes as a literal and %I escapes with double quotes as an identifier (ie table name)
SELECT format(
$$
SELECT 'myvar_%s', %L;
$$,
intseries,
tstzseries
) FROM
generate_series(0,10,1) intseries,
generate_series('2020-01-01'::timestamptz, '2021-01-01'::timestamptz, '1 month'::interval) as tstzseries
;
-- you see that this just returns records that are sql statements themselves
-- if we use the \gexec command it will run each query separately and as long as you are not in a transaction, will commit each row separately
\gexec
-- we can use this trick to run each month/collection as it's own transaction with our function we created above
-- we can create a table that we just use as a queue
CREATE TABLE collection_months AS SELECT DISTINCT collection_id, date_trunc('month', datetime) as month FROM items;
SELECT format(
$$
SELECT update_aggregate(%L, %L);
$$,
collection_id,
month
) FROM collection_months m WHERE NOT EXISTS ( -- in case we are doing a rerun, don't try with anything that we already have processed
SELECT TRUE FROM collection_grid_month cgm WHERE m.month=cgm.month and m.collection_id=cgm.collection_id LIMIT 1
);
\gexec
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment