Skip to content

Instantly share code, notes, and snippets.

@PaoloLeonard
Last active December 14, 2023 18:52
Show Gist options
  • Save PaoloLeonard/86e90843d789261e3e69597b54f8f7f1 to your computer and use it in GitHub Desktop.
Save PaoloLeonard/86e90843d789261e3e69597b54f8f7f1 to your computer and use it in GitHub Desktop.
rootsconf-dq
import streamlit as st
import yaml, json
import matplotlib.pyplot as plt
# --- "Data quality score" tab -------------------------------------------------
# Scores an uploaded checks file (number of named checks per quality dimension)
# and an uploaded schema/contract file (presence of the contract fields), then
# renders one expander per dimension plus an overall percentage.

# Top-level keys a schema file must provide to get the full stewardship score.
CONTRACT_ARGS = ('dataset_name', 'columns', 'criticality', 'consumer', 'producer')

# Minimum number of checks expected per dimension. The key is also the keyword
# that must appear (case-insensitively) in a check's ``name`` to be counted.
EXPECTED_CHECK_COUNTS = {
    'completeness': 3,
    'consistency': 2,
    'freshness': 1,
    'uniqueness': 2,
    'validity': 4,
}


def _load_yaml_mapping(uploaded_file, label):
    """Parse an uploaded YAML file and return its top-level mapping.

    Returns ``{}`` when nothing was uploaded, the file is empty, the YAML is
    invalid, or the top-level value is not a mapping. The last two cases are
    reported with ``st.error`` in the current container instead of crashing.
    """
    if not uploaded_file:
        return {}
    try:
        content = yaml.safe_load(uploaded_file)
    except yaml.YAMLError as err:
        st.error(f'Could not parse the {label} file: {err}')
        return {}
    if content is None:
        return {}
    if not isinstance(content, dict):
        st.error(f'The {label} file must contain a YAML mapping at the top level.')
        return {}
    return content


def _count_named_checks(checks, dimension):
    """Count checks whose ``name`` contains *dimension* (case-insensitive).

    *checks* is read as ``{section: [{check expression: {"name": ...}}, ...]}``.
    Entries that do not follow that shape (bare string checks, checks without
    a configuration mapping or without a string ``name``) are skipped rather
    than raising.
    """
    if not isinstance(checks, dict):
        return 0
    keyword = dimension.upper()
    found = 0
    for section_checks in checks.values():
        if not isinstance(section_checks, list):
            continue
        for check in section_checks:
            if not isinstance(check, dict):
                continue
            for config in check.values():
                name = config.get('name') if isinstance(config, dict) else None
                if isinstance(name, str) and keyword in name.upper():
                    found += 1
    return found


def _render_dimension(column, dimension, checks):
    """Render the coverage expander for *dimension*; return its check count."""
    expected = EXPECTED_CHECK_COUNTS[dimension]
    with column:
        with st.expander(dimension.capitalize()):
            if not checks:
                st.write(":red[Please upload a check file]")
                return 0
            found = _count_named_checks(checks, dimension)
            if found >= expected:
                st.write(f':green[Great you are covered for the {dimension} dimension]')
            elif found > 0:
                st.write(f':orange[Please provide at least {expected} {dimension} checks]')
            else:
                # Only the freshness panel spells out how many checks are needed.
                hint = f' We need {expected} more.' if dimension == 'freshness' else ''
                st.write(f':red[Did you even try?...{hint}]')
            return found


def _render_stewardship(column, schema):
    """Render contract findings; return the fraction of contract fields present.

    The result is ``0.0`` when no schema is available and ``1.0`` when every
    entry of ``CONTRACT_ARGS`` has a truthy value. Columns lacking a type or a
    description are reported but do not affect the returned fraction.
    """
    with column:
        with st.expander('Stewardship'):
            if not schema:
                st.write(':red[Please provide a schema file.]')
                return 0.0
            missing = 0
            for arg in CONTRACT_ARGS:
                if not schema.get(arg):
                    missing += 1
                    st.write(f':red[We need a {arg}]')
            # Assumes ``columns`` maps column name -> attribute mapping; any
            # other layout is ignored here -- TODO confirm the schema format.
            columns = schema.get('columns')
            if isinstance(columns, dict):
                for col, spec in columns.items():
                    # A column declared without attributes parses as None.
                    spec = spec if isinstance(spec, dict) else {}
                    if not spec.get('description') or not spec.get('type'):
                        st.write(f':red[Missing type or description for column {col}]')
            return (len(CONTRACT_ARGS) - missing) / len(CONTRACT_ARGS)


# ``dashboards`` is consumed by the dashboards section further down the script.
dq_tab, dashboards = st.tabs(['Data quality score', 'Dashboards'])

with dq_tab:
    st.title('Data quality score')

    # Layout: one row of uploaders followed by two rows of dimension panels.
    col_uploader_schema, col_uploader_checks = st.columns(2)
    col_completeness, col_freshness, col_consistency = st.columns(3)
    col_validity, col_uniqueness, col_stewardship = st.columns(3)

    with col_uploader_schema:
        st.header('Schema uploader')
        schema_file = st.file_uploader(label='Schema', type=['yaml', 'yml'], key='schema')
        schema = _load_yaml_mapping(schema_file, 'schema')

    with col_uploader_checks:
        st.header('Checks uploader')
        checks_file = st.file_uploader(label='Checks', type=['yaml', 'yml'], key='checks')
        checks = _load_yaml_mapping(checks_file, 'checks')

    dimension_columns = (
        ('completeness', col_completeness),
        ('freshness', col_freshness),
        ('consistency', col_consistency),
        ('uniqueness', col_uniqueness),
        ('validity', col_validity),
    )
    check_counts = {}
    for dimension, column in dimension_columns:
        check_counts[dimension] = _render_dimension(column, dimension, checks)

    steward_ratio = _render_stewardship(col_stewardship, schema)

    # Cap every dimension at 1 so surplus checks in one dimension can neither
    # hide gaps in another nor push the overall score above 100%.
    coverage_ratios = [
        min(found / EXPECTED_CHECK_COUNTS[dimension], 1.0)
        for dimension, found in check_counts.items()
    ]
    coverage_ratio = sum(coverage_ratios) / len(coverage_ratios)

    # Check coverage and stewardship each contribute half of the final score.
    overall_score = (coverage_ratio + steward_ratio) * 50
    st.header(f"DQ score: {overall_score:.1f}%")
# --- "Dashboards" tab ----------------------------------------------------------
# Reads an uploaded JSON results file and draws one passed/failed pie chart for
# the overall data health and one per quality dimension.

# Keywords looked up (case-insensitively) in each result's ``name``.
RESULT_DIMENSIONS = ('freshness', 'consistency', 'completeness', 'validity', 'uniqueness')


def _tally_results(results):
    """Count total and failed checks overall and per dimension.

    Reads ``results['checks']`` as a list of mappings with ``name`` and
    ``outcome`` keys. A result counts as failed only when its ``outcome`` is
    exactly ``'fail'``. A result is added to every dimension whose keyword
    occurs in its name; results without a string name only count towards the
    overall bucket. A missing ``checks`` list or non-mapping entries are
    tolerated and contribute nothing.

    Returns ``(overall, per_dimension)`` where each bucket is a dict with
    ``'total'`` and ``'fail'`` counters.
    """
    overall = {'total': 0, 'fail': 0}
    per_dimension = {dim: {'total': 0, 'fail': 0} for dim in RESULT_DIMENSIONS}

    entries = results.get('checks') if isinstance(results, dict) else None
    for entry in entries or []:
        if not isinstance(entry, dict):
            continue
        raw_name = entry.get('name')
        name = raw_name.upper() if isinstance(raw_name, str) else ''
        has_failed = entry.get('outcome') == 'fail'

        matching = [overall]
        matching.extend(
            per_dimension[dim] for dim in RESULT_DIMENSIONS if dim.upper() in name
        )
        for bucket in matching:
            bucket['total'] += 1
            if has_failed:
                bucket['fail'] += 1
    return overall, per_dimension


def _render_pass_fail_pie(column, title, bucket):
    """Draw *title* and, if the bucket holds any check, a passed/failed pie."""
    with column:
        st.subheader(title)
        if bucket['total'] <= 0:
            return
        fig, ax = plt.subplots()
        ax.pie(
            [bucket['total'] - bucket['fail'], bucket['fail']],
            labels=('Passed', 'Failed'),
            colors=['green', 'red'],
        )
        ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
        st.pyplot(fig)
        # pyplot keeps every figure registered until it is closed; without this
        # each rerun of the script would leave its figures behind.
        plt.close(fig)


with dashboards:
    st.header('Dashboards')
    st.subheader('Results uploader')
    results_file = st.file_uploader(label='Results', type=['json'], key='results')

    results = {}
    if results_file:
        try:
            results = json.load(results_file)
        except ValueError as err:  # covers json.JSONDecodeError and bad encodings
            st.error(f'Could not parse the results file: {err}')

    data_health_bucket, dimension_buckets = _tally_results(results)

    # Layout: two rows of three charts.
    data_health, freshness_col_graph, cons_col_graph = st.columns(3)
    comp_col_graph, val_col_graph, uniqueness_col_graph = st.columns(3)

    _render_pass_fail_pie(data_health, "Data health", data_health_bucket)
    chart_slots = (
        (freshness_col_graph, 'freshness'),
        (cons_col_graph, 'consistency'),
        (comp_col_graph, 'completeness'),
        (val_col_graph, 'validity'),
        (uniqueness_col_graph, 'uniqueness'),
    )
    for column, dimension in chart_slots:
        _render_pass_fail_pie(column, dimension.capitalize(), dimension_buckets[dimension])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment