Created
March 18, 2020 12:30
-
-
Save jmriego/7cc4ec704656e579f7afd22b16f90b61 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{# | |
Generates an SCD2 from daily rows per id | |
Example: | |
Input: client_data | |
| id | color | name | city | dt | | |
|------+-------+-------+--------+------------| | |
| 1 | red | John | Jerez | 2020-01-01 | | |
| 1 | blue | John | Jerez | 2020-01-02 | | |
| 1 | red | John | Cadiz | 2020-01-03 | | |
| 1 | red | John | Jerez | 2020-01-04 | | |
| 2 | red | Mary | Cork | 2020-01-01 | | |
| 2 | red | Mary | Dublin | 2020-01-02 | | |
{{ scd2( | |
"client_data", | |
columns=["color", "name"], | |
key_columns=["id"], | |
date_column="dt", | |
dt_from_to_columns=["date_from", "date_to"]) }} | |
Output: | |
| id | color | name | date_from | date_to | | |
|------+-------+-------+------------+------------| | |
| 1 | red | John | 2020-01-01 | 2020-01-01 | | |
| 1 | blue | John | 2020-01-02 | 2020-01-02 | | |
| 1 | red | John | 2020-01-03 | 2020-01-04 | | |
| 2 | red | Mary | 2020-01-01 | 2020-01-02 | | |
Arguments: | |
relation_alias: the alias we are selecting from | |
columns: Column name list for the data, required | |
key_columns: Column key name list, required. | |
It would usually align to the primary key of the source | |
(not including the date columns) | |
date_column: column name with the date | |
dt_from_to_columns: list of two columns with the names we want | |
for our date_from,date_to columns of the scd2 | |
#} | |
{% macro scd2(relation_alias, | |
columns, | |
key_columns, | |
date_column, | |
dt_from_to_columns=["date_from", "date_to"]) %} | |
WITH dbt_SCD2__{{ relation_alias }}__add_row_hash AS ( | |
SELECT | |
{{ key_columns|join(",") }}, | |
{{ date_column }}, | |
{{ columns|join(",") }}, | |
{# calculate a hash of the values used for the scd2 #} | |
{{ dbt_utils.surrogate_key(*columns) }} AS dbt_SCD2__row_hash | |
FROM | |
{{ relation_alias }} | |
), dbt_SCD2__{{ relation_alias }}__prev_row_hash AS ( | |
select | |
dbt_SCD2__{{ relation_alias }}__add_row_hash.*, | |
{# show the hash of previous row. every time the hash is different to the previous one, #} | |
{# it means there is a new row in an scd2 #} | |
lag(dbt_SCD2__row_hash | |
) over ( | |
partition by | |
{{ key_columns|join(",") }} | |
order by | |
{{ date_column }} | |
) AS dbt_SCD2__prev_row_hash | |
FROM | |
dbt_SCD2__{{ relation_alias }}__add_row_hash | |
), dbt_SCD2__{{ relation_alias }}__date_changed AS ( | |
select | |
dbt_SCD2__{{ relation_alias }}__prev_row_hash.*, | |
{# go through rows sorted by date and increase the "surrogate_key" value for every time the hash changed #} | |
SUM( | |
CASE WHEN dbt_SCD2__row_hash <> dbt_SCD2__prev_row_hash THEN 1 ELSE 0 END | |
) over ( | |
partition by | |
{{ key_columns|join(",") }} | |
order by | |
{{ date_column }} | |
) AS dbt_SCD2__surrogate_key | |
from | |
dbt_SCD2__{{ relation_alias }}__prev_row_hash | |
), dbt_SCD2__{{ relation_alias }}__result AS ( | |
select | |
{{ key_columns|join(",") }}, | |
{% for col in columns %} | |
any_value({{ col }}) AS {{ col }}, | |
{% endfor %} | |
min({{ date_column }}) AS {{ dt_from_to_columns[0] }}, | |
max({{ date_column }}) AS {{ dt_from_to_columns[1] }} | |
from dbt_SCD2__{{ relation_alias }}__date_changed | |
group by | |
{% for col in key_columns %} | |
{{ col }}, | |
{% endfor %} | |
dbt_SCD2__surrogate_key | |
) | |
select * from dbt_SCD2__{{ relation_alias }}__result | |
{% endmacro %} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment