thuwarakeshm/additional.py

## additional.py
# Validation schema for the orders dataframe: one Column entry per expected
# column, each pairing a dtype with the checks its values must pass.
schema = pa.DataFrameSchema(
    {
        "order_value": Column(
            "int64",
            [
                # Bounds: values must be strictly below 1000 and strictly
                # above 100.
                Check.less_than(1000),
                Check.greater_than(100),
                # Aggregate check: the callable sums its input and requires
                # the total to exceed 1000.
                Check(lambda x: x.sum() > 1000),
            ],
        ),
        "gender": Column("str", [Check.isin(["M", "F"])]),
        "age": Column("int64", [Check.less_than(40), Check.greater_than(0)]),
        # Raw string keeps the regex escapes (\d, \w) away from Python's own
        # escape processing: four digits, a hyphen, then two word characters.
        "batch": Column("str", [Check.str_matches(r"\d{4}-\w{2}")]),
        "student": Column("bool"),
    }
)

## annotate.py
@pa.check_input(schema)
@pa.check_output(schema2)
def avg_age_by_studentship(df):
    """Return the mean ``age`` for each distinct value of ``student``.

    The rows of ``df`` are grouped on the ``student`` column, the ``age``
    values of each group are averaged, and the group key is moved back into
    a regular column via ``reset_index``.

    The pandera decorators are given ``schema`` for the input and ``schema2``
    for the output; both names are defined outside this snippet.
    """
    ages_by_student = df.groupby("student")["age"]
    mean_age = ages_by_student.mean()
    return mean_age.reset_index()

## config.yaml
schema_type: dataframe
version: 0.11.0
columns:

  # This configuration branch is about the 'order_value' column
  order_value:

    # Check if data type is int64
    dtype: int64

    # This field cannot have null values
    nullable: false

    checks:

      # check that values are strictly below 1000
      less_than: 1000

      # check that values are strictly above 100
      greater_than: 100

    unique: false

    # if dtype is different try converting it before raising an error
    coerce: true

    # this field is required
    required: true

    # the column name is a literal name, not a regex pattern
    regex: false

  gender:
    dtype: str
    nullable: false
    checks:

      # check if values are in a predefined list
      isin:
      - M
      - F

    unique: false
    coerce: false
    required: true
    regex: false
  age:
    dtype: int64
    nullable: false
    checks:
      less_than: 40
      greater_than: 0
    unique: false
    coerce: false
    required: true
    regex: false
  batch:
    dtype: str
    nullable: false
    checks: null

    # batch id should be unique
    unique: true

    coerce: false
    required: true
    regex: false
  student:
    dtype: bool
    nullable: false
    checks: null
    unique: false

    # try converting before raising an error
    coerce: true

    required: true
    regex: false
checks: null
index: null
coerce: false
strict: false
unique: null

## config2.yaml
schema_type: dataframe
version: 0.11.0
columns:
  student:
    dtype: bool
    nullable: false
    checks: null
    unique: false
    coerce: false
    required: true
    regex: false
  age:
    dtype: float64
    nullable: false
    checks:
      less_than: 24
      greater_than: 5
    unique: false
    coerce: false
    required: true
    regex: false
checks: null
index: null
coerce: false
strict: false
unique: null

## data.py
# Sample dataframe with ten rows and five columns: order_value, gender, age,
# batch and student.
df = pd.DataFrame(
    data=dict(
        order_value=[841, 487, 208, 571, 554, 225, 186, 338, 996, 260],
        gender=["M", "M", "M", "M", "M", "F", "M", "M", "F", "F"],
        age=[10, 18, 8, 24, 23, 5, 23, 10, 16, 19],
        batch=[
            "2938-SS", "2309-TT", "2309-SW", "0923-OW", "5615-SD",
            "2320-LI", "0932-SO", "2308-WE", "9832-TC", "1092-PW",
        ],
        student=[True, True, False, True, False, True, True, True, False, True],
    )
)

## error.yaml
less_than: 800

## hypo.py
# Schema that attaches a statistical hypothesis to "order_value" on top of
# the plain value checks: a two-sample t-test between the "M" and "F" groups
# of the "gender" column, with relationship "greater_than" at alpha 0.05 and
# equal variances assumed.
schema = pa.DataFrameSchema(
    columns={
        "order_value": Column(
            "int64",
            checks=[
                Check.less_than(1000),
                Check.greater_than(100),
                # The callable sums its input and requires a total above 1000.
                Check(lambda values: values.sum() > 1000),
                Hypothesis.two_sample_ttest(
                    groupby="gender",
                    sample1="M",
                    sample2="F",
                    relationship="greater_than",
                    alpha=0.05,
                    equal_var=True,
                ),
            ],
        ),
        "gender": Column("str", checks=[Check.isin(["M", "F"])]),
    }
)

## loadyaml.py
# Read the YAML schema definition from disk. The encoding is pinned to UTF-8
# so the result does not depend on the platform's locale default.
with open("validation.yml", "r", encoding="utf-8") as f:
    schema_config = f.read()

# Hand the YAML text to pandera's from_yaml loader to obtain the schema.
schema = pa.io.from_yaml(schema_config)

## schema.py
# Baseline schema: a dtype for each of the five columns, plus range checks on
# the numeric columns and a membership check on "gender".
schema = pa.DataFrameSchema(
    columns={
        "order_value": Column(
            "int64",
            checks=[
                Check.less_than(1000),
                Check.greater_than(100),
            ],
        ),
        "gender": Column("str", checks=[Check.isin(["M", "F"])]),
        "age": Column(
            "int64",
            checks=[
                Check.less_than(40),
                Check.greater_than(0),
            ],
        ),
        "batch": Column("str"),
        "student": Column("bool"),
    }
)
	# Validation schema for the orders dataframe: one Column entry per expected
	# column, each pairing a dtype with the checks its values must pass.
	schema = pa.DataFrameSchema(
	    {
	        "order_value": Column(
	            "int64",
	            [
	                # Bounds: values must be strictly below 1000 and strictly
	                # above 100.
	                Check.less_than(1000),
	                Check.greater_than(100),
	                # Aggregate check: the callable sums its input and
	                # requires the total to exceed 1000.
	                Check(lambda x: x.sum() > 1000),
	            ],
	        ),
	        "gender": Column("str", [Check.isin(["M", "F"])]),
	        "age": Column("int64", [Check.less_than(40), Check.greater_than(0)]),
	        # Raw string keeps the regex escapes (\d, \w) away from Python's
	        # own escape processing: four digits, a hyphen, two word characters.
	        "batch": Column("str", [Check.str_matches(r"\d{4}-\w{2}")]),
	        "student": Column("bool"),
	    }
	)
	@pa.check_input(schema)
	@pa.check_output(schema2)
	def avg_age_by_studentship(df):
	    """Return the mean ``age`` for each distinct value of ``student``.

	    Rows are grouped on the ``student`` column, ``age`` is averaged per
	    group, and ``reset_index`` turns the group key back into a column.

	    The pandera decorators are given ``schema`` for the input and
	    ``schema2`` for the output; both names are defined outside this snippet.
	    """
	    return df.groupby("student").age.mean().reset_index()
	schema_type: dataframe
	version: 0.11.0
	columns:

	# This configuration branch is about the 'order_value' column
	order_value:

	# Check if data type is int64
	dtype: int64

	# This field cannot have null values
	nullable: false

	checks:

	# check that values are strictly below 1000
	less_than: 1000

	# check that values are strictly above 100
	greater_than: 100

	unique: false

	# if dtype is different try converting it before raising an error
	coerce: true

	# this field is required
	required: true

	# the column name is a literal name, not a regex pattern
	regex: false

	gender:
	dtype: str
	nullable: false
	checks:

	# check if values are in a predefined list
	isin:
	- M
	- F

	unique: false
	coerce: false
	required: true
	regex: false
	age:
	dtype: int64
	nullable: false
	checks:
	less_than: 40
	greater_than: 0
	unique: false
	coerce: false
	required: true
	regex: false
	batch:
	dtype: str
	nullable: false
	checks: null

	# batch id should be unique
	unique: true

	coerce: false
	required: true
	regex: false
	student:
	dtype: bool
	nullable: false
	checks: null
	unique: false

	# try converting before raising an error
	coerce: true

	required: true
	regex: false
	checks: null
	index: null
	coerce: false
	strict: false
	unique: null
	# Sample dataframe with ten rows and five columns: order_value, gender,
	# age, batch and student.
	df = pd.DataFrame(
	    data=dict(
	        order_value=[841, 487, 208, 571, 554, 225, 186, 338, 996, 260],
	        gender=["M", "M", "M", "M", "M", "F", "M", "M", "F", "F"],
	        age=[10, 18, 8, 24, 23, 5, 23, 10, 16, 19],
	        batch=[
	            "2938-SS", "2309-TT", "2309-SW", "0923-OW", "5615-SD",
	            "2320-LI", "0932-SO", "2308-WE", "9832-TC", "1092-PW",
	        ],
	        student=[True, True, False, True, False, True, True, True, False, True],
	    )
	)
	# Read the YAML schema definition from disk. The encoding is pinned to
	# UTF-8 so the result does not depend on the platform's locale default.
	with open("validation.yml", "r", encoding="utf-8") as f:
	    schema_config = f.read()

	# Hand the YAML text to pandera's from_yaml loader to obtain the schema.
	schema = pa.io.from_yaml(schema_config)
	# Baseline schema: a dtype for each of the five columns, plus range checks
	# on the numeric columns and a membership check on "gender".
	schema = pa.DataFrameSchema(
	    columns={
	        "order_value": Column(
	            "int64",
	            checks=[
	                Check.less_than(1000),
	                Check.greater_than(100),
	            ],
	        ),
	        "gender": Column("str", checks=[Check.isin(["M", "F"])]),
	        "age": Column(
	            "int64",
	            checks=[
	                Check.less_than(40),
	                Check.greater_than(0),
	            ],
	        ),
	        "batch": Column("str"),
	        "student": Column("bool"),
	    }
	)