Last active
June 14, 2022 16:53
-
-
Save thuwarakeshm/8fd15178cbfbac59c65e36e51c66f028 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Validation schema for the orders DataFrame: one Column per field, each
# declaring the expected dtype and the checks its values must pass.
schema = pa.DataFrameSchema(
    {
        "order_value": Column(
            "int64",
            [
                Check.less_than(1000),
                Check.greater_than(100),
                # Check on the column as a whole: its total must exceed 1000.
                Check(lambda x: x.sum() > 1000),
            ],
        ),
        "gender": Column("str", [Check.isin(["M", "F"])]),
        "age": Column("int64", [Check.less_than(40), Check.greater_than(0)]),
        # Raw string so the regex escapes \d and \w are not read as
        # (invalid) Python string escapes. Pattern: four digits, a hyphen,
        # then two word characters, e.g. "2938-SS".
        "batch": Column("str", [Check.str_matches(r"\d{4}-\w{2}")]),
        "student": Column("bool"),
    }
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@pa.check_input(schema)
@pa.check_output(schema2)
def avg_age_by_studentship(df):
    """Return the mean age for each value of the ``student`` column.

    The decorators validate the incoming DataFrame against ``schema`` and
    the returned DataFrame against ``schema2``.

    Args:
        df: DataFrame with at least the ``student`` and ``age`` columns.

    Returns:
        A DataFrame with one row per distinct ``student`` value and the
        columns ``student`` and ``age`` (the group mean).
    """
    mean_age = df.groupby("student")["age"].mean()
    # reset_index turns the group key back into a regular column.
    return mean_age.reset_index()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pandera DataFrame schema in YAML form.
schema_type: dataframe
version: 0.11.0
columns:
  # This configuration branch is about the 'order_value' column
  order_value:
    # Check if data type is int64
    dtype: int64
    # This field cannot have null values
    nullable: false
    checks:
      # check if values don't exceed 1000
      less_than: 1000
      # check if values aren't lower than 100
      greater_than: 100
    unique: false
    # if dtype is different try converting it before raising an error
    coerce: true
    # this field is required
    required: true
    # the column name is matched literally, not as a regex pattern
    regex: false
  gender:
    dtype: str
    nullable: false
    checks:
      # check if values are in a predefined list
      isin:
        - M
        - F
    unique: false
    coerce: false
    required: true
    regex: false
  age:
    dtype: int64
    nullable: false
    checks:
      less_than: 40
      greater_than: 0
    unique: false
    coerce: false
    required: true
    regex: false
  batch:
    dtype: str
    nullable: false
    checks: null
    # batch id should be unique
    unique: true
    coerce: false
    required: true
    regex: false
  student:
    dtype: bool
    nullable: false
    checks: null
    unique: false
    # try converting before raising an error
    coerce: true
    required: true
    regex: false
# DataFrame-level settings (apply to the frame as a whole, not one column)
checks: null
index: null
coerce: false
strict: false
unique: null
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pandera DataFrame schema in YAML form with two columns: 'student' and 'age'.
# NOTE(review): presumably the schema for the aggregated output (mean age per
# student flag) — confirm against the code that loads it.
schema_type: dataframe
version: 0.11.0
columns:
  student:
    dtype: bool
    nullable: false
    checks: null
    unique: false
    coerce: false
    required: true
    regex: false
  age:
    # float64 here, unlike an int64 raw age column
    dtype: float64
    nullable: false
    checks:
      less_than: 24
      greater_than: 5
    unique: false
    coerce: false
    required: true
    regex: false
# DataFrame-level settings (apply to the frame as a whole, not one column)
checks: null
index: null
coerce: false
strict: false
unique: null
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample data: ten rows with the columns order_value, gender, age, batch
# and student.
df = pd.DataFrame(
    {
        "order_value": [841, 487, 208, 571, 554, 225, 186, 338, 996, 260],
        "gender": ["M", "M", "M", "M", "M", "F", "M", "M", "F", "F"],
        "age": [10, 18, 8, 24, 23, 5, 23, 10, 16, 19],
        # Batch ids: four digits, a hyphen, two letters; all distinct.
        "batch": [
            "2938-SS",
            "2309-TT",
            "2309-SW",
            "0923-OW",
            "5615-SD",
            "2320-LI",
            "0932-SO",
            "2308-WE",
            "9832-TC",
            "1092-PW",
        ],
        "student": [True, True, False, True, False, True, True, True, False, True],
    }
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# values must be strictly below 800
less_than: 800
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Schema combining per-value checks with a statistical hypothesis test on
# order_value, grouped by gender.
schema = pa.DataFrameSchema(
    {
        "order_value": Column(
            "int64",
            [
                Check.less_than(1000),
                Check.greater_than(100),
                # Check on the column as a whole: its total must exceed 1000.
                Check(lambda x: x.sum() > 1000),
                # Two-sample t-test on order_value, comparing the "M" group
                # against the "F" group (groups taken from the "gender"
                # column) with relationship "greater_than" at alpha=0.05.
                Hypothesis.two_sample_ttest(
                    sample1="M",
                    sample2="F",
                    groupby="gender",
                    alpha=0.05,
                    equal_var=True,
                    relationship="greater_than",
                ),
            ],
        ),
        "gender": Column("str", [Check.isin(["M", "F"])]),
    }
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the YAML schema definition from disk and build the schema object from
# its text. The encoding is given explicitly so the result does not depend on
# the platform's default locale encoding.
with open("validation.yml", "r", encoding="utf-8") as f:
    schema_config = f.read()
schema = pa.io.from_yaml(schema_config)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Basic validation schema: a dtype for every column, plus range checks on
# the numeric columns and a membership check on gender.
schema = pa.DataFrameSchema(
    {
        "order_value": Column(
            "int64", [Check.less_than(1000), Check.greater_than(100)]
        ),
        "gender": Column("str", [Check.isin(["M", "F"])]),
        "age": Column("int64", [Check.less_than(40), Check.greater_than(0)]),
        # dtype-only columns: no value checks are attached.
        "batch": Column("str"),
        "student": Column("bool"),
    }
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment