Skip to content

Instantly share code, notes, and snippets.

@raven-rock
Last active May 11, 2023 00:39
Show Gist options
  • Save raven-rock/5f67a98d13c750b9f5611eb230e937af to your computer and use it in GitHub Desktop.
Save raven-rock/5f67a98d13c750b9f5611eb230e937af to your computer and use it in GitHub Desktop.
"Pure SQL" algorithm for updating of slowly changing dimension (SCD Type 2) table records with just two DML statements (SQLite example)
-- SQLite
-- Test: sqlite3 -header -echo -column :memory: < %
create table t1 (
bk int
, valid_from datetime default CURRENT_TIMESTAMP
, valid_to datetime default '9999-12-31 23:59:59'
, a1 varchar(20)
, primary key (bk, valid_from)
);
insert into t1 (bk, a1) values
(1, 'A')
, (2, 'B')
, (3, 'C')
, (4, 'D')
;
/*
Create a dataset to repesent our staging table.
*/
create temp table stage (
bk int primary key
, a1 varchar(20)
);
-- Remove 3
-- New 5
-- Change 2
insert into stage (bk, a1) values
(1, 'Y') -- Changed to Y
, (2, 'Z') -- Changed to Z
-- , (3, 'C') -- Removed
, (4, 'D')
, (5, 'E') -- New
, (6, 'F') -- New
;
/*
We are using multiple DML statements, so best practice is to wrap it in a
transaction.
*/
begin;
/*
Create a "variable" to house the timestamp when the procedure begins so that
in case the following UPDATE and INSERT statements take more than a second --
which in production they most certainly will -- so that we do not have any gap
between the valid_to of outdated SCD record and valid_from of the new SCD
record. (SQLite doesn't have support for variables directly, so emulating it
with a 1-dimensional temp table.
*/
create temp table _timestamp as select CURRENT_TIMESTAMP ts;
/*
UPDATE the "deleted" and outdated records by filling in the valid_from with
the ts to logically "close" them from an SCD2 standpoint.
*/
update t1
set valid_to = (select ts from _timestamp)
where bk in (
select bk
from t1
where 1=1
and valid_to = '9999-12-31 23:59:59'
and (
-- "Outdated" records
exists (
select bk
from stage
where stage.bk = t1.bk
and (stage.a1 <> t1.a1)
)
/*
"Deleted" records - remove this next "OR NOT EXISTS" clause if you never
want to close a BK completely i.e. always keep the last record seen
"open".
*/
or not exists (
select bk
from stage
where stage.bk = t1.bk
)
)
)
;
/*
Insert records for both new and "changed" records.
*/
insert into t1
(
bk
, valid_from
, valid_to
, a1
)
select
bk
, datetime(_timestamp.ts,'+1 seconds') as valid_from
, '9999-12-31 23:59:59' as valid_to
, a1
from
stage
join _timestamp
where not exists (
select 1
from t1
where t1.bk = stage.bk
and t1.valid_to = '9999-12-31 23:59:59'
)
;
commit;
/*
Check result.
*/
-- All SCD records
select * from t1;
-- Closed records
select * from t1 where valid_to <> '9999-12-31 23:59:59' ;
-- Current dataset
select * from t1 where valid_to ='9999-12-31 23:59:59' ;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment