# PaoloLeonard/dq_app.py

## dq_app.py
import streamlit as st
import yaml, json
import matplotlib.pyplot as plt

# Number of named checks expected per data quality dimension. A check counts
# towards a dimension when its name contains the dimension's name.
EXPECTED_CHECK_COUNTS = {
    'completeness': 3,
    'consistency': 2,
    'freshness': 1,
    'uniqueness': 2,
    'validity': 4,
}

# Top-level keys a schema file must provide to get the full stewardship score.
CONTRACT_ARGS = ['dataset_name', 'columns', 'criticality', 'consumer', 'producer']


def _load_yaml_mapping(uploaded_file):
    """Parse an uploaded YAML file and return its content as a dict.

    Returns an empty dict when nothing was uploaded, when the file is empty,
    or when its top level is not a mapping. A file that is not valid YAML is
    reported on the page and also yields an empty dict.
    """
    if not uploaded_file:
        return {}
    try:
        content = yaml.safe_load(uploaded_file)
    except yaml.YAMLError as err:
        st.error(f'Could not parse the uploaded YAML file: {err}')
        return {}
    return content if isinstance(content, dict) else {}


def _iter_check_names(checks):
    """Yield the ``name`` of every named check in a parsed checks file.

    The file is read as a mapping of section key to a list of checks, each
    check being a mapping of check expression to its configuration. Entries
    that do not follow that shape (bare string checks, checks without a
    ``name``) are skipped instead of raising.
    """
    if not isinstance(checks, dict):
        return
    for dataset_checks in checks.values():
        if not isinstance(dataset_checks, list):
            continue
        for check in dataset_checks:
            if not isinstance(check, dict):
                continue
            for config in check.values():
                if isinstance(config, dict) and isinstance(config.get('name'), str):
                    yield config['name']


def _count_checks_by_dimension(checks):
    """Return ``{dimension: number of check names mentioning it}``.

    Matching is case-insensitive. A name that mentions several dimensions is
    counted once for each of them.
    """
    counts = {dimension: 0 for dimension in EXPECTED_CHECK_COUNTS}
    for name in _iter_check_names(checks):
        upper_name = name.upper()
        for dimension in counts:
            if dimension.upper() in upper_name:
                counts[dimension] += 1
    return counts


def _coverage_message(dimension, count, expected):
    """Return the coloured status line for one dimension's check coverage."""
    if count >= expected:
        return f':green[Great you are covered for the {dimension} dimension]'
    if count > 0:
        return f':orange[Please provide at least {expected} {dimension} checks]'
    if dimension == 'freshness':
        return f':red[Did you even try?... We need {expected} more.]'
    return ':red[Did you even try?...]'


def _incomplete_columns(columns):
    """Return the names of columns lacking a ``type`` or a ``description``.

    ``columns`` is read as a mapping of column name to its specification; any
    other shape is not inspected and yields an empty list.
    """
    if not isinstance(columns, dict):
        return []
    return [
        name
        for name, spec in columns.items()
        if not isinstance(spec, dict)
        or not spec.get('description')
        or not spec.get('type')
    ]


dq_score, dashboards = st.tabs(['Data quality score', 'Dashboards'])

with dq_score:
    st.title('Data quality score')

    # Create every layout container up front so the page keeps the same
    # arrangement whether or not files have been uploaded.
    col_uploader_schema, col_uploader_checks = st.columns(2)
    col_completeness, col_freshness, col_consistency = st.columns(3)
    col_validity, col_uniqueness, col_stewardship = st.columns(3)
    dimension_columns = {
        'completeness': col_completeness,
        'freshness': col_freshness,
        'consistency': col_consistency,
        'validity': col_validity,
        'uniqueness': col_uniqueness,
    }

    with col_uploader_schema:
        st.header('Schema uploader')
        schema = _load_yaml_mapping(
            st.file_uploader(label='Schema', type=['yaml', 'yml'], key='schema')
        )

    with col_uploader_checks:
        st.header('Checks uploader')
        checks = _load_yaml_mapping(
            st.file_uploader(label='Checks', type=['yaml', 'yml'], key='checks')
        )

    check_counts = _count_checks_by_dimension(checks)
    for dimension, expected in EXPECTED_CHECK_COUNTS.items():
        with dimension_columns[dimension]:
            with st.expander(dimension.capitalize()):
                if checks:
                    st.write(_coverage_message(dimension, check_counts[dimension], expected))
                else:
                    st.write(":red[Please upload a check file]")

    with col_stewardship:
        with st.expander('Stewardship'):
            if schema:
                missing_args = [arg for arg in CONTRACT_ARGS if not schema.get(arg)]
                for arg in missing_args:
                    st.write(f':red[We need a {arg}]')
                for column in _incomplete_columns(schema.get('columns')):
                    st.write(f':red[Missing type or description for column {column}]')
            else:
                # Without a schema every contract key counts as missing.
                missing_args = CONTRACT_ARGS
                st.write(':red[Please provide a schema file.]')

    steward_score = (len(CONTRACT_ARGS) - len(missing_args)) / len(CONTRACT_ARGS)
    # Cap each dimension at 1 so surplus checks in one dimension cannot push
    # the overall score above 100%.
    dimension_score = sum(
        min(check_counts[dimension] / expected, 1)
        for dimension, expected in EXPECTED_CHECK_COUNTS.items()
    ) / len(EXPECTED_CHECK_COUNTS)
    # Checks coverage and stewardship each contribute half of the score.
    st.header(f"DQ score: {(dimension_score + steward_score) * 50:.1f}%")

# Key of the bucket that aggregates every result, whatever its dimension.
OVERALL_BUCKET = 'data health'

# Buckets charted on the dashboard, in display order (two rows of three).
# A result counts towards a dimension when its name contains that word.
DASHBOARD_BUCKETS = (
    OVERALL_BUCKET,
    'freshness',
    'consistency',
    'completeness',
    'validity',
    'uniqueness',
)


def _load_result_checks(results_file):
    """Return ``(has_failures, checks)`` read from an uploaded results file.

    ``checks`` is the list stored under the ``checks`` key of the JSON
    document and ``has_failures`` the value of its ``hasFailures`` key.
    Returns ``(False, [])`` when nothing was uploaded or the document does not
    have that shape; a file that is not valid JSON is reported on the page.
    """
    if not results_file:
        return False, []
    try:
        results = json.load(results_file)
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        st.error(f'Could not parse the uploaded results file: {err}')
        return False, []
    if not isinstance(results, dict):
        return False, []
    result_checks = results.get('checks')
    if not isinstance(result_checks, list):
        result_checks = []
    return results.get('hasFailures'), result_checks


def _tally_results(result_checks):
    """Count total and failed results overall and per dimension.

    Returns ``{bucket: {'total': int, 'fail': int}}`` for every entry of
    ``DASHBOARD_BUCKETS``. Dimension matching is a case-insensitive substring
    test on the result's ``name``; results without a name only count towards
    the overall bucket. A result is failed when its ``outcome`` is ``'fail'``.
    """
    buckets = {key: {'total': 0, 'fail': 0} for key in DASHBOARD_BUCKETS}
    for result in result_checks:
        if not isinstance(result, dict):
            continue
        upper_name = str(result.get('name') or '').upper()
        has_failed = result.get('outcome') == 'fail'
        for key, bucket in buckets.items():
            if key != OVERALL_BUCKET and key.upper() not in upper_name:
                continue
            bucket['total'] += 1
            if has_failed:
                bucket['fail'] += 1
    return buckets


def _render_pass_fail_pie(column, title, bucket):
    """Draw a passed/failed pie chart for ``bucket`` under ``title``.

    Only the title is shown when the bucket holds no result.
    """
    with column:
        st.subheader(title)
        if bucket['total'] > 0:
            fig, ax = plt.subplots()
            ax.pie(
                [bucket['total'] - bucket['fail'], bucket['fail']],
                labels=('Passed', 'Failed'),
                colors=['green', 'red'],
            )
            ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
            st.pyplot(fig)
            # pyplot keeps a reference to every figure it creates; close it so
            # figures do not pile up across script reruns.
            plt.close(fig)


with dashboards:
    st.header('Dashboards')
    st.subheader('Results uploader')
    results_file = st.file_uploader(label='Results', type=['json'], key='results')
    # `failed` mirrors the file's overall failure flag; the charts do not read it.
    failed, checks = _load_result_checks(results_file)
    buckets = _tally_results(checks)

    data_health, freshness_col_graph, cons_col_graph = st.columns(3)
    comp_col_graph, val_col_graph, uniqueness_col_graph = st.columns(3)
    graph_columns = (
        data_health,
        freshness_col_graph,
        cons_col_graph,
        comp_col_graph,
        val_col_graph,
        uniqueness_col_graph,
    )

    for column, key in zip(graph_columns, DASHBOARD_BUCKETS):
        _render_pass_fail_pie(column, key.capitalize(), buckets[key])
	import streamlit as st
	import yaml, json
	import matplotlib.pyplot as plt

	# NOTE(review): this section repeats the script found earlier in this file,
	# widget keys included, and had lost all of its nesting. Confirm the copy
	# is intentional; Streamlit expects widget keys to be unique.

	# Number of named checks expected per data quality dimension. A check counts
	# towards a dimension when its name contains the dimension's name.
	EXPECTED_CHECK_COUNTS = {
	    'completeness': 3,
	    'consistency': 2,
	    'freshness': 1,
	    'uniqueness': 2,
	    'validity': 4,
	}

	# Top-level keys a schema file must provide to get the full stewardship score.
	CONTRACT_ARGS = ['dataset_name', 'columns', 'criticality', 'consumer', 'producer']


	def _load_yaml_mapping(uploaded_file):
	    """Parse an uploaded YAML file and return its content as a dict.

	    Returns an empty dict when nothing was uploaded, when the file is empty,
	    or when its top level is not a mapping. A file that is not valid YAML is
	    reported on the page and also yields an empty dict.
	    """
	    if not uploaded_file:
	        return {}
	    try:
	        content = yaml.safe_load(uploaded_file)
	    except yaml.YAMLError as err:
	        st.error(f'Could not parse the uploaded YAML file: {err}')
	        return {}
	    return content if isinstance(content, dict) else {}


	def _iter_check_names(checks):
	    """Yield the ``name`` of every named check in a parsed checks file.

	    The file is read as a mapping of section key to a list of checks, each
	    check being a mapping of check expression to its configuration. Entries
	    that do not follow that shape (bare string checks, checks without a
	    ``name``) are skipped instead of raising.
	    """
	    if not isinstance(checks, dict):
	        return
	    for dataset_checks in checks.values():
	        if not isinstance(dataset_checks, list):
	            continue
	        for check in dataset_checks:
	            if not isinstance(check, dict):
	                continue
	            for config in check.values():
	                if isinstance(config, dict) and isinstance(config.get('name'), str):
	                    yield config['name']


	def _count_checks_by_dimension(checks):
	    """Return ``{dimension: number of check names mentioning it}``.

	    Matching is case-insensitive. A name that mentions several dimensions is
	    counted once for each of them.
	    """
	    counts = {dimension: 0 for dimension in EXPECTED_CHECK_COUNTS}
	    for name in _iter_check_names(checks):
	        upper_name = name.upper()
	        for dimension in counts:
	            if dimension.upper() in upper_name:
	                counts[dimension] += 1
	    return counts


	def _coverage_message(dimension, count, expected):
	    """Return the coloured status line for one dimension's check coverage."""
	    if count >= expected:
	        return f':green[Great you are covered for the {dimension} dimension]'
	    if count > 0:
	        return f':orange[Please provide at least {expected} {dimension} checks]'
	    if dimension == 'freshness':
	        return f':red[Did you even try?... We need {expected} more.]'
	    return ':red[Did you even try?...]'


	def _incomplete_columns(columns):
	    """Return the names of columns lacking a ``type`` or a ``description``.

	    ``columns`` is read as a mapping of column name to its specification; any
	    other shape is not inspected and yields an empty list.
	    """
	    if not isinstance(columns, dict):
	        return []
	    return [
	        name
	        for name, spec in columns.items()
	        if not isinstance(spec, dict)
	        or not spec.get('description')
	        or not spec.get('type')
	    ]


	dq_score, dashboards = st.tabs(['Data quality score', 'Dashboards'])

	with dq_score:
	    st.title('Data quality score')

	    # Create every layout container up front so the page keeps the same
	    # arrangement whether or not files have been uploaded.
	    col_uploader_schema, col_uploader_checks = st.columns(2)
	    col_completeness, col_freshness, col_consistency = st.columns(3)
	    col_validity, col_uniqueness, col_stewardship = st.columns(3)
	    dimension_columns = {
	        'completeness': col_completeness,
	        'freshness': col_freshness,
	        'consistency': col_consistency,
	        'validity': col_validity,
	        'uniqueness': col_uniqueness,
	    }

	    with col_uploader_schema:
	        st.header('Schema uploader')
	        schema = _load_yaml_mapping(
	            st.file_uploader(label='Schema', type=['yaml', 'yml'], key='schema')
	        )

	    with col_uploader_checks:
	        st.header('Checks uploader')
	        checks = _load_yaml_mapping(
	            st.file_uploader(label='Checks', type=['yaml', 'yml'], key='checks')
	        )

	    check_counts = _count_checks_by_dimension(checks)
	    for dimension, expected in EXPECTED_CHECK_COUNTS.items():
	        with dimension_columns[dimension]:
	            with st.expander(dimension.capitalize()):
	                if checks:
	                    st.write(_coverage_message(dimension, check_counts[dimension], expected))
	                else:
	                    st.write(":red[Please upload a check file]")

	    with col_stewardship:
	        with st.expander('Stewardship'):
	            if schema:
	                missing_args = [arg for arg in CONTRACT_ARGS if not schema.get(arg)]
	                for arg in missing_args:
	                    st.write(f':red[We need a {arg}]')
	                for column in _incomplete_columns(schema.get('columns')):
	                    st.write(f':red[Missing type or description for column {column}]')
	            else:
	                # Without a schema every contract key counts as missing.
	                missing_args = CONTRACT_ARGS
	                st.write(':red[Please provide a schema file.]')

	    steward_score = (len(CONTRACT_ARGS) - len(missing_args)) / len(CONTRACT_ARGS)
	    # Cap each dimension at 1 so surplus checks in one dimension cannot push
	    # the overall score above 100%.
	    dimension_score = sum(
	        min(check_counts[dimension] / expected, 1)
	        for dimension, expected in EXPECTED_CHECK_COUNTS.items()
	    ) / len(EXPECTED_CHECK_COUNTS)
	    # Checks coverage and stewardship each contribute half of the score.
	    st.header(f"DQ score: {(dimension_score + steward_score) * 50:.1f}%")

	# NOTE(review): this section repeats the dashboards tab found earlier in
	# this file, widget key included, and had lost all of its nesting. Confirm
	# the copy is intentional; Streamlit expects widget keys to be unique.

	# Key of the bucket that aggregates every result, whatever its dimension.
	OVERALL_BUCKET = 'data health'

	# Buckets charted on the dashboard, in display order (two rows of three).
	# A result counts towards a dimension when its name contains that word.
	DASHBOARD_BUCKETS = (
	    OVERALL_BUCKET,
	    'freshness',
	    'consistency',
	    'completeness',
	    'validity',
	    'uniqueness',
	)


	def _load_result_checks(results_file):
	    """Return ``(has_failures, checks)`` read from an uploaded results file.

	    ``checks`` is the list stored under the ``checks`` key of the JSON
	    document and ``has_failures`` the value of its ``hasFailures`` key.
	    Returns ``(False, [])`` when nothing was uploaded or the document does not
	    have that shape; a file that is not valid JSON is reported on the page.
	    """
	    if not results_file:
	        return False, []
	    try:
	        results = json.load(results_file)
	    except (json.JSONDecodeError, UnicodeDecodeError) as err:
	        st.error(f'Could not parse the uploaded results file: {err}')
	        return False, []
	    if not isinstance(results, dict):
	        return False, []
	    result_checks = results.get('checks')
	    if not isinstance(result_checks, list):
	        result_checks = []
	    return results.get('hasFailures'), result_checks


	def _tally_results(result_checks):
	    """Count total and failed results overall and per dimension.

	    Returns ``{bucket: {'total': int, 'fail': int}}`` for every entry of
	    ``DASHBOARD_BUCKETS``. Dimension matching is a case-insensitive substring
	    test on the result's ``name``; results without a name only count towards
	    the overall bucket. A result is failed when its ``outcome`` is ``'fail'``.
	    """
	    buckets = {key: {'total': 0, 'fail': 0} for key in DASHBOARD_BUCKETS}
	    for result in result_checks:
	        if not isinstance(result, dict):
	            continue
	        upper_name = str(result.get('name') or '').upper()
	        has_failed = result.get('outcome') == 'fail'
	        for key, bucket in buckets.items():
	            if key != OVERALL_BUCKET and key.upper() not in upper_name:
	                continue
	            bucket['total'] += 1
	            if has_failed:
	                bucket['fail'] += 1
	    return buckets


	def _render_pass_fail_pie(column, title, bucket):
	    """Draw a passed/failed pie chart for ``bucket`` under ``title``.

	    Only the title is shown when the bucket holds no result.
	    """
	    with column:
	        st.subheader(title)
	        if bucket['total'] > 0:
	            fig, ax = plt.subplots()
	            ax.pie(
	                [bucket['total'] - bucket['fail'], bucket['fail']],
	                labels=('Passed', 'Failed'),
	                colors=['green', 'red'],
	            )
	            ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
	            st.pyplot(fig)
	            # pyplot keeps a reference to every figure it creates; close it so
	            # figures do not pile up across script reruns.
	            plt.close(fig)


	with dashboards:
	    st.header('Dashboards')
	    st.subheader('Results uploader')
	    results_file = st.file_uploader(label='Results', type=['json'], key='results')
	    # `failed` mirrors the file's overall failure flag; the charts do not read it.
	    failed, checks = _load_result_checks(results_file)
	    buckets = _tally_results(checks)

	    data_health, freshness_col_graph, cons_col_graph = st.columns(3)
	    comp_col_graph, val_col_graph, uniqueness_col_graph = st.columns(3)
	    graph_columns = (
	        data_health,
	        freshness_col_graph,
	        cons_col_graph,
	        comp_col_graph,
	        val_col_graph,
	        uniqueness_col_graph,
	    )

	    for column, key in zip(graph_columns, DASHBOARD_BUCKETS):
	        _render_pass_fail_pie(column, key.capitalize(), buckets[key])