Skip to content

Instantly share code, notes, and snippets.

@stkrp
Last active April 26, 2019 13:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stkrp/f97401ebf4b0f71d2728f91df15bb03f to your computer and use it in GitHub Desktop.
Save stkrp/f97401ebf4b0f71d2728f91df15bb03f to your computer and use it in GitHub Desktop.
from transformation import pipeline, rename_fields, transform_field_value, replace, group_fields
if __name__ == '__main__':
    # Demo: flatten-to-nested conversion of a single car record.

    # Lookup tables that turn textual categories into numeric codes.
    gender_codes = {"female": 0, "male": 1}
    engine_type_codes = {"petrol": 0, "diesel": 1, "gas": 2}

    # Flat fields that get collected under the "owner" key ...
    owner_field_names = (
        "owner_first_name",
        "owner_last_name",
        "owner_year_of_birth",
        "owner_gender",
    )
    # ... and the shorter names they receive once inside that group.
    owner_field_renames = {
        "owner_first_name": "first_name",
        "owner_last_name": "last_name",
        "owner_year_of_birth": "year_of_birth",
        "owner_gender": "gender",
    }

    # Steps run top to bottom; each one receives the previous step's result.
    car_transformer = pipeline(
        # Expand the abbreviated key before any step refers to the long name.
        rename_fields({"owner_yob": "owner_year_of_birth"}),
        transform_field_value("owner_year_of_birth", int),
        transform_field_value("owner_gender", replace(gender_codes, None)),
        # Collect the owner fields first, then rename them inside the group.
        group_fields("owner", owner_field_names),
        transform_field_value("owner", rename_fields(owner_field_renames)),
        transform_field_value("reg_num", str.upper),
        # "millage" holds a list of numbers; sum() collapses it to a total.
        transform_field_value("millage", sum),
        transform_field_value("engine_type", replace(engine_type_codes, None)),
    )

    # "engine_power_hp" has no step of its own in the pipeline above.
    car_data = {
        "owner_first_name": "Vladimir",
        "owner_last_name": "Kukov",
        "owner_yob": "1977",
        "owner_gender": "male",
        "reg_num": "a176Bc99",
        "millage": [
            100000,
            56000,
            30002,
        ],
        "engine_power_hp": 190.1,
        "engine_type": "petrol",
    }

    print(car_transformer(car_data))
# Output:
#
# {
# 'owner': {
# 'first_name': 'Vladimir',
# 'last_name': 'Kukov',
# 'year_of_birth': 1977,
# 'gender': 1,
# },
# 'reg_num': 'A176BC99',
# 'millage': 186002,
# 'engine_power_hp': 190.1,
# 'engine_type': 0,
# }
#
"""
Transformation layer of ETL
Transformer is a callable object that takes a value, performs certain actions
with it, and returns an altered or new value. Transformers can accept and
return values of different types.
"""
from typing import MutableMapping, Mapping, Any, Callable, Iterable, Hashable
# Type alias for a transformer: a callable that accepts exactly one value and
# returns one value, with no restriction on the type of either.
Transformer = Callable[[Any], Any]
# ==== Generic ============================================================== #
def pipeline(*steps: Transformer) -> Transformer:
    """
    Build one transformer that runs the given transformers back to back.

    The first step receives the value passed to the combined transformer;
    every following step receives the result of the step before it. Steps
    run in the order in which they were passed (FIFO). The result of the
    last step is returned; with no steps at all the input is returned as is.
    """
    def run_steps(item: Any) -> Any:
        current = item
        for apply_step in steps:
            # Feed each step with whatever the previous one produced.
            current = apply_step(current)
        return current

    return run_steps
def replace(
    value_mapping: Mapping[Hashable, Any], default: Any = NotImplemented,
) -> Transformer:
    """
    Build a transformer that substitutes values using a lookup table.

    A value found among the keys of the mapping is replaced by the value
    stored under it. A value that is not found is replaced by `default` if
    one was passed; otherwise it is handed back unchanged. `NotImplemented`
    marks "no default", so it cannot itself serve as a default value.

    Mapping: old value -> new value.

    [!] A substituted value is the object held by the mapping (or `default`);
        a value that is kept is the very same object that was passed in.

    >>> to_code = replace({'female': 0, 'male': 1}, None)
    >>> to_code('male')
    1
    >>> to_code('unknown') is None
    True
    >>> replace({'a': 'b'})('zzz')
    'zzz'
    """
    # The choice of fallback is fixed at construction time, so decide it once.
    has_default = default is not NotImplemented

    def substitute(value: Hashable) -> Any:
        if value in value_mapping:
            return value_mapping[value]
        return default if has_default else value

    return substitute
# ==== Mappings ============================================================= #
def transform_field_value(
    field_name: str, field_value_transformer: Transformer,
) -> Transformer:
    """
    Build a transformer that rewrites the value of one field of a mapping.

    When the field is present, its current value is passed to
    `field_value_transformer` and whatever comes back is stored under the
    same field name. When the field is absent, nothing is called and the
    mapping is returned untouched. In both cases the mapping that was
    passed in is the object that is returned.

    Because the inner transformer only ever sees the field value, nested
    structures can be handled by nesting this function in itself:

    >>> t = transform_field_value('parent', transform_field_value('child', int))  # NoQA
    >>> t({'parent': {'child': '123'}})
    {'parent': {'child': 123}}

    [!] Will change the original object.

    # TODO: Move to unittest

    # Flat
    >>> transformer = transform_field_value('login', str.upper)
    >>> input_ = {'login': 'bOb', 'password': '123qwe'}
    >>> output = transformer(input_)
    >>> output == {'login': 'BOB', 'password': '123qwe'}
    True
    >>> output is input_
    True

    # Nested
    >>> transformer = transform_field_value('a', transform_field_value('b', str.upper))  # NoQA
    >>> input_ = {'id': 59, 'a': {'b': 'gog', 'key': 'abc'}}
    >>> output = transformer(input_)
    >>> output == {'id': 59, 'a': {'b': 'GOG', 'key': 'abc'}}
    True
    >>> output is input_
    True
    >>> output['a'] is input_['a']
    True

    # Empty
    >>> transformer = transform_field_value('pk', str.upper)
    >>> input_ = {'a': 1, 'b': 2}
    >>> output = transformer(input_)
    >>> output == {'a': 1, 'b': 2}
    True
    >>> output is input_
    True
    """
    def apply_to_field(item: MutableMapping[str, Any]) -> MutableMapping[str, Any]:  # NoQA
        # Nothing to transform: hand the mapping back exactly as received.
        if field_name not in item:
            return item

        current_value = item[field_name]
        item[field_name] = field_value_transformer(current_value)
        return item

    return apply_to_field
def rename_fields(field_name_mapping: Mapping[str, str]) -> Transformer:
    """
    Build a transformer that renames the keys of a mapping.

    Mapping: old field name -> new field name. Old names that are missing
    from the item are ignored. An existing field whose name equals one of
    the new names is overwritten.

    All renames are applied simultaneously: every affected value is detached
    from the item before any value is stored under its new name. Overlapping
    mappings such as a swap (`{'a': 'b', 'b': 'a'}`) or a chain
    (`{'a': 'b', 'b': 'c'}`) therefore work without losing data and do not
    depend on iteration order. If several old names map to the same new
    name, the one that comes last in `field_name_mapping` wins.

    [!] Will change the original object.

    >>> transformer = rename_fields({'yob': 'year_of_birth', 'nick': 'login'})
    >>> input_ = {'name': 'Ivan', 'yob': 1990}
    >>> output = transformer(input_)
    >>> output == {'name': 'Ivan', 'year_of_birth': 1990}
    True
    >>> output is input_
    True

    >>> swap = rename_fields({'a': 'b', 'b': 'a'})
    >>> swap({'a': 1, 'b': 2}) == {'a': 2, 'b': 1}
    True
    """
    def transformer(item: MutableMapping[str, Any]) -> MutableMapping[str, Any]:  # NoQA
        # Phase 1: pull every value that has to move out of the item. Doing
        # this before any write guarantees that a new name can never clobber
        # a value that is itself still waiting to be renamed.
        moved = {}
        for old_field_name, new_field_name in field_name_mapping.items():
            if old_field_name in item:
                moved[new_field_name] = item.pop(old_field_name)

        # Phase 2: store the detached values under their new names.
        item.update(moved)
        return item

    return transformer
def group_fields(group_name: str, field_names: Iterable[str]) -> Transformer:
    """
    Build a transformer that moves a set of fields into a nested mapping.

    The listed fields are removed from the original mapping and stored in a
    new dict, which is then placed in the original mapping under
    `group_name`. Fields that do not exist in the mapping are not put in the
    group. If the group name equals the name of a field that belongs to the
    group, the value of that field is kept inside the group under its
    original name. The group always overwrites an existing field with the
    same name.

    The keys of the group follow the order of `field_names` (duplicates are
    ignored after their first occurrence), so the result is the same on
    every run regardless of string hashing.

    `field_names` is consumed once, when the transformer is built, so a
    one-shot iterable is fine.

    [!] Will change the original object.

    >>> transformer = group_fields('user', ['name', 'nick', 'gender'])
    >>> input_ = {'name': 'Ivan', 'age': 30, 'gender': 'm'}
    >>> output = transformer(input_)
    >>> output == {'age': 30, 'user': {'name': 'Ivan', 'gender': 'm'}}
    True
    >>> output is input_
    True
    >>> list(output['user'])
    ['name', 'gender']

    >>> transformer = group_fields('user', ['name', 'nick', 'gender', 'user'])
    >>> input_ = {'name': 'Ivan', 'gender': 'm', 'age': 30, 'user': 'iv4'}
    >>> output = transformer(input_)
    >>> output == {'age': 30, 'user': {'name': 'Ivan', 'gender': 'm', 'user': 'iv4'}}  # NoQA
    True
    >>> output is input_
    True
    """
    # dict.fromkeys() drops duplicates while keeping first-seen order; a set
    # would make the key order of every produced group depend on hash order.
    ordered_field_names = tuple(dict.fromkeys(field_names))

    def transformer(item: MutableMapping[str, Any]) -> MutableMapping[str, Any]:  # NoQA
        group = {}
        for field_name in ordered_field_names:
            if field_name in item:
                group[field_name] = item.pop(field_name)

        # Assigned last so the group wins over any remaining field that has
        # the same name as the group.
        item[group_name] = group
        return item

    return transformer
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment