Last active
May 11, 2023 00:39
-
-
Save raven-rock/5f67a98d13c750b9f5611eb230e937af to your computer and use it in GitHub Desktop.
"Pure SQL" algorithm for updating of slowly changing dimension (SCD Type 2) table records with just two DML statements (SQLite example)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- SQLite | |
-- Test: sqlite3 -header -echo -column :memory: < % | |
create table t1 ( | |
bk int | |
, valid_from datetime default CURRENT_TIMESTAMP | |
, valid_to datetime default '9999-12-31 23:59:59' | |
, a1 varchar(20) | |
, primary key (bk, valid_from) | |
); | |
insert into t1 (bk, a1) values | |
(1, 'A') | |
, (2, 'B') | |
, (3, 'C') | |
, (4, 'D') | |
; | |
/* | |
Create a dataset to repesent our staging table. | |
*/ | |
create temp table stage ( | |
bk int primary key | |
, a1 varchar(20) | |
); | |
-- Remove 3 | |
-- New 5 | |
-- Change 2 | |
insert into stage (bk, a1) values | |
(1, 'Y') -- Changed to Y | |
, (2, 'Z') -- Changed to Z | |
-- , (3, 'C') -- Removed | |
, (4, 'D') | |
, (5, 'E') -- New | |
, (6, 'F') -- New | |
; | |
/* | |
We are using multiple DML statements, so best practice is to wrap it in a | |
transaction. | |
*/ | |
begin; | |
/* | |
Create a "variable" to house the timestamp when the procedure begins so that | |
in case the following UPDATE and INSERT statements take more than a second -- | |
which in production they most certainly will -- so that we do not have any gap | |
between the valid_to of outdated SCD record and valid_from of the new SCD | |
record. (SQLite doesn't have support for variables directly, so emulating it | |
with a 1-dimensional temp table. | |
*/ | |
create temp table _timestamp as select CURRENT_TIMESTAMP ts; | |
/* | |
UPDATE the "deleted" and outdated records by filling in the valid_from with | |
the ts to logically "close" them from an SCD2 standpoint. | |
*/ | |
update t1 | |
set valid_to = (select ts from _timestamp) | |
where bk in ( | |
select bk | |
from t1 | |
where 1=1 | |
and valid_to = '9999-12-31 23:59:59' | |
and ( | |
-- "Outdated" records | |
exists ( | |
select bk | |
from stage | |
where stage.bk = t1.bk | |
and (stage.a1 <> t1.a1) | |
) | |
/* | |
"Deleted" records - remove this next "OR NOT EXISTS" clause if you never | |
want to close a BK completely i.e. always keep the last record seen | |
"open". | |
*/ | |
or not exists ( | |
select bk | |
from stage | |
where stage.bk = t1.bk | |
) | |
) | |
) | |
; | |
/* | |
Insert records for both new and "changed" records. | |
*/ | |
insert into t1 | |
( | |
bk | |
, valid_from | |
, valid_to | |
, a1 | |
) | |
select | |
bk | |
, datetime(_timestamp.ts,'+1 seconds') as valid_from | |
, '9999-12-31 23:59:59' as valid_to | |
, a1 | |
from | |
stage | |
join _timestamp | |
where not exists ( | |
select 1 | |
from t1 | |
where t1.bk = stage.bk | |
and t1.valid_to = '9999-12-31 23:59:59' | |
) | |
; | |
commit; | |
/* | |
Check result. | |
*/ | |
-- All SCD records | |
select * from t1; | |
-- Closed records | |
select * from t1 where valid_to <> '9999-12-31 23:59:59' ; | |
-- Current dataset | |
select * from t1 where valid_to ='9999-12-31 23:59:59' ; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment