Last active
June 16, 2023 13:36
-
-
Save villoro/68c5f1a4c81ed9c649f6329a7f811315 to your computer and use it in GitHub Desktop.
Parsing data with pydantic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
from datetime import datetime, date
from typing import Dict, List, Optional

import yaml
from pydantic import BaseModel, ValidationError, validator, root_validator, Field
class Column(BaseModel):
    """Metadata describing a single column of a table.

    Fields:
        name: lowercase snake_case identifier (letters and underscores only,
            starting and ending with a letter).
        comment: mandatory human readable description of the column.
        partition_column: whether the column is used for partitioning.
            Partition columns must have a name starting with ``p_``.
    """

    # Anchored at both ends so the whole name has to match; an unanchored
    # pattern would accept any string that merely starts with a valid name.
    name: str = Field(regex=r"^[a-z][a-z_]*[a-z]$")
    comment: str
    partition_column: Optional[bool] = False

    @root_validator
    def check_correct_naming_for_partition_cols(cls, values):
        """Ensure that partition columns follow the ``p_`` naming convention.

        Args:
            values: the field values collected so far for this model.

        Returns:
            The unchanged ``values`` dict.

        Raises:
            ValueError: if ``partition_column`` is truthy and ``name`` does not
                start with ``p_``.
        """
        if values.get("partition_column"):
            # 'name' may be missing from 'values' (e.g. when it failed its own
            # validation), hence the empty-string default.
            # An explicit raise is used instead of 'assert' so the check is
            # still enforced when Python runs with optimizations (-O).
            if not values.get("name", "").startswith("p_"):
                raise ValueError("The name of a partition column must start with 'p_'")
        return values
class Table(BaseModel):
    """Metadata describing a table and its columns.

    Fields:
        files_per_partition: number of files written per partition (>= 1).
        columns: definitions of the columns of the table.
        partition_columns: names of the partition columns. It must NOT be
            provided by the user; it is derived from ``columns``.
    """

    files_per_partition: int = Field(ge=1)  # info: https://pydantic-docs.helpmanual.io/usage/schema/
    columns: List[Column]
    # Must be declared after 'columns': its validator reads the already
    # validated 'columns' from 'values'.
    partition_columns: Optional[List[str]]

    @validator("files_per_partition")
    def check_files_per_partition(cls, v):
        """Warn (without failing) when too many files per partition are requested.

        Args:
            v: the validated ``files_per_partition`` value.

        Returns:
            The unchanged value.
        """
        if v > 100:
            # This is only a recommendation, so the value is still accepted.
            logging.getLogger(__name__).warning(
                "You shouldn't write more than 100 files per partition. "
                "This will imply poor reading performance"
            )
        return v

    @validator("partition_columns", pre=True, always=True)
    def check_partition_columns(cls, v, values):
        """Derive ``partition_columns`` from the columns flagged as partition columns.

        Runs even when the field is not supplied (``always=True``) so the value
        can be computed automatically.

        Args:
            v: the raw user supplied value, expected to be empty.
            values: the previously validated fields of the model.

        Returns:
            A list with the name of the single partition column.

        Raises:
            ValueError: if the user supplied a value, if ``columns`` is missing
                or invalid, or if the number of partition columns is not
                exactly one.
        """
        if v:
            raise ValueError(
                "'partition_columns' shouldn't be populated since they will be automatically created"
            )

        columns = values.get("columns")
        if not columns:
            raise ValueError("Columns are missing or wrongly defined")

        # Extract from columns info
        partition_columns = [c.name for c in columns if c.partition_column]

        # An explicit raise is used instead of 'assert' so the check is still
        # enforced when Python runs with optimizations (-O).
        if len(partition_columns) != 1:
            raise ValueError("There must be only one partition_column")
        return partition_columns
# This works well: every column has a valid name and a comment, and exactly
# one column is flagged as partition column with a name starting with 'p_'.
metadata = yaml.safe_load("""
files_per_partition: 1
columns:
  -
    name: age
    comment: Age in years for the person
  -
    name: p_creation_date
    comment: String date in format 'YYYY-MM-DD'
    partition_column: True
  -
    name: city
    comment: IATA city code
""")
table = Table.parse_obj(metadata)
# This fails on purpose: parsing is expected to raise a ValidationError because
#   - 'files_per_partition' is below the minimum of 1
#   - the first column has no 'comment'
#   - 'City' contains an uppercase letter and, being flagged as a partition
#     column, does not start with 'p_'
#   - 'partition_columns' cannot be derived since 'columns' is not valid
metadata2 = yaml.safe_load("""
files_per_partition: 0
columns:
  -
    name: age
  -
    name: p_creation_date
    comment: String date in format 'YYYY-MM-DD'
    partition_column: True
  -
    name: City
    comment: IATA city code
    partition_column: True
""")
table2 = Table.parse_obj(metadata2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment