Skip to content

Instantly share code, notes, and snippets.

@villoro
Last active June 16, 2023 13:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save villoro/68c5f1a4c81ed9c649f6329a7f811315 to your computer and use it in GitHub Desktop.
Save villoro/68c5f1a4c81ed9c649f6329a7f811315 to your computer and use it in GitHub Desktop.
Parsing data with pydantic
import logging
from datetime import datetime, date
from typing import Dict, List, Optional

import yaml
from pydantic import BaseModel, ValidationError, validator, root_validator, Field
class Column(BaseModel):
    """Metadata describing one column of a table.

    Attributes are validated by pydantic when the model is built:

    * ``name`` must be lowercase snake_case and at least two characters long.
    * ``comment`` is a mandatory free-text description.
    * ``partition_column`` flags the column as a partition column; such
      columns must have a name starting with ``p_``.
    """

    # pydantic applies ``regex`` with ``re.match``, which only anchors the start
    # of the string. The explicit ``$`` is needed so that names with an invalid
    # tail (e.g. "ageX" or "age1") are rejected instead of matching on "age".
    name: str = Field(regex=r"^[a-z][a-z_]*[a-z]$")
    comment: str
    partition_column: Optional[bool] = False

    @root_validator
    def check_correct_naming_for_partition_cols(cls, values):
        """Ensure that partition columns are named with the ``p_`` prefix.

        Args:
            values: Mapping of the fields that passed their own validation.

        Returns:
            The unchanged ``values`` mapping.

        Raises:
            ValueError: If ``partition_column`` is truthy and ``name`` does not
                start with ``p_``. pydantic reports it as a validation error.
        """
        if values.get("partition_column"):
            # "name" is missing from ``values`` when its own validation failed,
            # hence the empty-string default.
            # An explicit raise is used instead of ``assert`` so that the check
            # is not stripped when Python runs with ``-O``.
            if not values.get("name", "").startswith("p_"):
                raise ValueError("The name of a partition column must start with 'p_'")
        return values
class Table(BaseModel):
    """Metadata describing a table and its columns.

    ``partition_columns`` must not be supplied by the caller: it is derived
    from the entries of ``columns`` flagged with ``partition_column``.
    """

    files_per_partition: int = Field(ge=1)  # info: https://pydantic-docs.helpmanual.io/usage/schema/
    columns: List[Column]
    partition_columns: Optional[List[str]]

    @validator("files_per_partition")
    def check_files_per_partition(cls, v):
        """Warn (without failing) when more than 100 files per partition are requested.

        Args:
            v: The validated ``files_per_partition`` value.

        Returns:
            ``v`` unchanged.
        """
        if v > 100:
            # The original code referenced an undefined ``log`` name here, which
            # raised NameError instead of emitting the warning.
            logging.getLogger(__name__).warning(
                "You shouldn't write more than 100 files per partition. "
                "This will imply poor reading performance"
            )
        return v

    @validator("partition_columns", pre=True, always=True)
    def check_partition_columns(cls, v, values):
        """Derive ``partition_columns`` from the validated ``columns``.

        Runs even when the field is absent (``always=True``) and before type
        coercion (``pre=True``).

        Args:
            v: The raw value supplied for ``partition_columns``, if any.
            values: Mapping of the previously declared fields that passed
                validation.

        Returns:
            A list with the name of the single partition column.

        Raises:
            ValueError: If ``partition_columns`` was supplied, if ``columns``
                is missing from ``values`` or empty, or if the number of
                partition columns is not exactly one.
        """
        if v:
            raise ValueError(
                "'partition_columns' shouldn't be populated since they will be automatically created"
            )

        # "columns" is absent from ``values`` when its own validation failed.
        columns = values.get("columns")
        if not columns:
            raise ValueError("Columns are missing or wrongly defined")

        # Extract from columns info
        partition_columns = [c.name for c in columns if c.partition_column]

        # Explicit raise rather than ``assert`` so the check survives ``-O``.
        if len(partition_columns) != 1:
            raise ValueError("There must be only one partition_column")

        return partition_columns
# This works well: every column has a valid lowercase name and a comment, and
# exactly one column is flagged as the partition column (named with "p_").
# The nesting of the YAML document is significant: each "-" entry under
# "columns" is a mapping with the keys of a Column.
metadata = yaml.safe_load("""
files_per_partition: 1
columns:
  -
    name: age
    comment: Age in years for the person
  -
    name: p_creation_date
    comment: String date in format 'YYYY-MM-DD'
    partition_column: True
  -
    name: city
    comment: IATA city code
""")
table = Table.parse_obj(metadata)
# This fails, on purpose, for several reasons:
#   - "files_per_partition" is below the minimum of 1
#   - the first column has no "comment"
#   - "City" is not a valid lowercase name and, being flagged as a partition
#     column, it does not start with "p_" either
# The nesting of the YAML document is significant: each "-" entry under
# "columns" is a mapping with the keys of a Column.
metadata2 = yaml.safe_load("""
files_per_partition: 0
columns:
  -
    name: age
  -
    name: p_creation_date
    comment: String date in format 'YYYY-MM-DD'
    partition_column: True
  -
    name: City
    comment: IATA city code
    partition_column: True
""")
try:
    table2 = Table.parse_obj(metadata2)
except ValidationError as exc:
    # Show the full list of validation problems instead of a bare traceback.
    print(exc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment