Skip to content

Instantly share code, notes, and snippets.

@thuwarakeshm
Last active June 14, 2022 16:53
Show Gist options
  • Save thuwarakeshm/8fd15178cbfbac59c65e36e51c66f028 to your computer and use it in GitHub Desktop.
Save thuwarakeshm/8fd15178cbfbac59c65e36e51c66f028 to your computer and use it in GitHub Desktop.
# Validation schema for the orders DataFrame: one Column entry per expected
# column, each declaring a dtype and the checks applied to that column.
schema = pa.DataFrameSchema(
    {
        "order_value": Column(
            "int64",
            [
                # Bounds on the values: 100 < order_value < 1000.
                Check.less_than(1000),
                Check.greater_than(100),
                # Aggregate check: the summed order values must exceed 1000.
                Check(lambda x: x.sum() > 1000),
            ],
        ),
        "gender": Column("str", [Check.isin(["M", "F"])]),
        "age": Column("int64", [Check.less_than(40), Check.greater_than(0)]),
        # Raw string so `\d` and `\w` reach the regex engine instead of being
        # treated as (invalid) string escapes. The pattern is four digits, a
        # dash, and two word characters.
        "batch": Column("str", [Check.str_matches(r"\d{4}-\w{2}")]),
        "student": Column("bool"),
    }
)
@pa.check_input(schema)
@pa.check_output(schema2)
def avg_age_by_studentship(df):
    """Return the mean ``age`` for each distinct ``student`` value.

    The decorators validate the incoming frame against ``schema`` and the
    returned frame against ``schema2`` (both defined outside this function).

    Args:
        df: DataFrame containing at least the ``student`` and ``age`` columns.

    Returns:
        A DataFrame with one row per ``student`` value, holding that value
        and the mean of ``age`` for the group.
    """
    # Item access instead of attribute access: it cannot be shadowed by a
    # GroupBy method that happens to share a column's name.
    return df.groupby("student")["age"].mean().reset_index()
schema_type: dataframe
version: 0.11.0
columns:
  # This configuration branch is about the 'order_value' column
  order_value:
    # Check if data type is int64
    dtype: int64
    # This field must not contain null values
    nullable: false
    checks:
      # every value must be strictly less than 1000
      less_than: 1000
      # every value must be strictly greater than 100
      greater_than: 100
    unique: false
    # if dtype is different try converting it before raising an error
    coerce: true
    # this field is required
    required: true
    # the column name is a literal name, not a regex pattern
    regex: false
  gender:
    dtype: str
    nullable: false
    checks:
      # check if values are in a predefined list
      isin:
      - M
      - F
    unique: false
    coerce: false
    required: true
    regex: false
  age:
    dtype: int64
    nullable: false
    checks:
      less_than: 40
      greater_than: 0
    unique: false
    coerce: false
    required: true
    regex: false
  batch:
    dtype: str
    nullable: false
    checks: null
    # batch id should be unique
    unique: true
    coerce: false
    required: true
    regex: false
  student:
    dtype: bool
    nullable: false
    checks: null
    unique: false
    # try converting before raising an error
    coerce: true
    required: true
    regex: false
# DataFrame-level settings (not tied to a single column)
checks: null
index: null
coerce: false
strict: false
unique: null
# NOTE(review): the columns here (student, float64 age) look like the shape of
# a per-student mean-age result; presumably this is the output-side schema.
# Confirm against the code that loads it.
schema_type: dataframe
version: 0.11.0
columns:
  student:
    dtype: bool
    nullable: false
    checks: null
    unique: false
    coerce: false
    required: true
    regex: false
  age:
    # float64 rather than int64
    dtype: float64
    nullable: false
    checks:
      # every value must be strictly less than 24
      less_than: 24
      # every value must be strictly greater than 5
      greater_than: 5
    unique: false
    coerce: false
    required: true
    regex: false
# DataFrame-level settings (not tied to a single column)
checks: null
index: null
coerce: false
strict: false
unique: null
# Ten sample order records, stored column-wise, used as input data for the
# schema examples.
df = pd.DataFrame(
    data={
        "order_value": [841, 487, 208, 571, 554, 225, 186, 338, 996, 260],
        "gender": ["M", "M", "M", "M", "M", "F", "M", "M", "F", "F"],
        "age": [10, 18, 8, 24, 23, 5, 23, 10, 16, 19],
        # Batch identifiers: four digits, a dash, two upper-case letters.
        "batch": [
            "2938-SS", "2309-TT", "2309-SW", "0923-OW", "5615-SD",
            "2320-LI", "0932-SO", "2308-WE", "9832-TC", "1092-PW",
        ],
        "student": [True, True, False, True, False, True, True, True, False, True],
    },
)
# NOTE(review): stray one-line YAML fragment; presumably a tightened
# `less_than` bound meant to replace one inside a column's `checks:` mapping.
# Confirm which column it belongs to.
less_than: 800
# Schema combining ordinary value checks with a statistical hypothesis test
# on `order_value`, grouped by `gender`.
schema = pa.DataFrameSchema(
    columns={
        "order_value": Column(
            dtype="int64",
            checks=[
                Check.less_than(1000),
                Check.greater_than(100),
                # Aggregate check on the summed order values.
                Check(lambda values: values.sum() > 1000),
                # Two-sample t-test between the "M" and "F" groups of
                # `gender`, with relationship "greater_than" at alpha 0.05
                # and equal variances assumed.
                Hypothesis.two_sample_ttest(
                    groupby="gender",
                    sample1="M",
                    sample2="F",
                    relationship="greater_than",
                    alpha=0.05,
                    equal_var=True,
                ),
            ],
        ),
        "gender": Column(dtype="str", checks=[Check.isin(["M", "F"])]),
    },
)
# Read the serialized schema from disk and rebuild the schema object from it.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of falling back to the locale default.
with open("validation.yml", "r", encoding="utf-8") as f:
    schema_config = f.read()

# Parsing happens after the `with` block, so the file is already closed.
schema = pa.io.from_yaml(schema_config)
# Baseline schema: a dtype for every column, plus simple range and
# membership checks where they apply.
schema = pa.DataFrameSchema(
    columns={
        "order_value": Column(
            dtype="int64",
            checks=[Check.less_than(1000), Check.greater_than(100)],
        ),
        "gender": Column(dtype="str", checks=[Check.isin(["M", "F"])]),
        "age": Column(
            dtype="int64",
            checks=[Check.less_than(40), Check.greater_than(0)],
        ),
        # dtype-only columns: no value checks attached.
        "batch": Column(dtype="str"),
        "student": Column(dtype="bool"),
    },
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment