Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
# ISSUE: Stylistically, I prefer to have a top-level controller and then
# form "views" which get laid out on the page in the order they were called.
# With just two forms, though, callback chaining is manageable and won’t result in a tangle,
# but the layout order is inverted, and in this callbacks solution the profile report and forms
# disappear when `cluster_duplicates` is finally called.
# (See this thread: <link missing>)
# For a better solution see: <link missing>
import streamlit as st
from streamlit_pandas_profiling import st_profile_report
from streamlit import session_state as session
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import streamlit_debug
# Configure the streamlit_debug helper at import time, with its flag switched off.
# NOTE(review): presumably flag=True would make the app wait for a remote debugger
# client on host:port — confirm against the streamlit_debug module.
streamlit_debug.set(flag=False, wait_for_client=True, host='localhost', port=8765)
def cluster_duplicates(df):
    """Callback for the "Cluster Duplicates" submit button.

    Collects the five clustering options from Streamlit session state, where
    they are stored under the keys ``col_name``, ``dis_num``,
    ``dis_non_alphanum``, ``similarity`` and ``affinity``. The visible body
    only gathers these values: it neither uses ``df`` nor returns anything.

    Args:
        df: The DataFrame passed by the submit button's ``args``.
    """
    # Pull every option out of session state in one pass.
    col_name, dis_num, dis_non_alphanum, similarity, affinity = (
        session.col_name,
        session.dis_num,
        session.dis_non_alphanum,
        session.similarity,
        session.affinity,
    )
    session_args_tuple = (col_name, dis_num, dis_non_alphanum, similarity, affinity)
@st.experimental_memo(show_spinner=True, persist='disk')
def get_profile_report(file_info, df):
    """Build a minimal, lazily evaluated, explorative profile report for ``df``.

    The result is memoized by Streamlit and persisted to disk.

    Args:
        file_info: Metadata describing the uploaded file. It is not used in
            the body; presumably it is passed so the memo cache key reflects
            the uploaded file — TODO confirm.
        df: The DataFrame to profile.

    Returns:
        The ``ProfileReport`` constructed for ``df``.
    """
    return ProfileReport(df, minimal=True, lazy=True, explorative=True)
def profiler():
    """Callback for the "Profile Data" button: profile the upload and offer clustering.

    Reads the uploaded file and the chosen delimiter from session state
    (keys ``upload`` and ``delim``), parses the file as delimited text,
    builds and renders its profile report, and lays out the form whose
    submit button invokes ``cluster_duplicates`` with the parsed DataFrame.
    If no file has been uploaded, a warning is shown and nothing else is
    rendered.
    """
    file = session.upload
    if file is None:
        # The upload form can be submitted empty; read_csv(None) would raise.
        st.warning("Please upload a file before profiling.")
        return

    # Options look like "comma (,)": take the second word and strip the
    # surrounding parentheses to get the separator itself.
    delimiter = session.delim.split(" ")[1][1:-1]
    # engine="python" is required for the tab option, whose separator is the
    # two-character text `\t` and is therefore interpreted as a regex.
    # NOTE(review): the uploader also accepts .xlsx, but only delimited text
    # is parsed here — confirm whether Excel support is intended.
    df = pd.read_csv(file, sep=delimiter, engine="python")

    file_info = {"Filename": file.name, "FileType": file.type, "FileSize": file.size}
    pr = get_profile_report(file_info, df)
    # Render the report; without this the (lazy) report is never shown.
    st_profile_report(pr)

    cluster_duplicates_form = st.form(key="cluster_duplicates")
    with cluster_duplicates_form:
        # Widget values are read back through their session-state keys, so
        # the return values are not captured here.
        st.selectbox("Select column for clustering", list(df.columns), key="col_name")
        st.checkbox("discard_numeric", key="dis_num")
        st.checkbox("discard_nonalpha_numeric", key="dis_non_alphanum")
        st.radio(
            "Select Similarity Measure",
            options=["levenshtein (recommended)", "cosine", "jaro_winkler", "trigram",
                     "levenshtein_partial"],
            key="similarity",
        )
        st.radio(
            "Select Distance Measure",
            options=["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"],
            key="affinity",
        )
    cluster_duplicates_form.form_submit_button(
        label="Cluster Duplicates", on_click=cluster_duplicates, args=(df,)
    )
def data_uploader_form():
    """Render the file-upload form.

    The form holds a CSV/XLSX uploader (session key ``upload``) and a
    delimiter picker (session key ``delim``). Its "Profile Data" submit
    button triggers ``profiler`` as an on-click callback.
    """
    # Each option is "<name> (<separator>)"; ``profiler`` extracts the
    # separator from between the parentheses, so keep this format.
    separator_choices = ["pipe (|)", r"tab (\t)", "comma (,)", "semicolon (;)"]

    upload_form = st.form(key="file_upload")
    with upload_form:
        st.file_uploader("Upload File", type=['csv', 'xlsx'], key="upload")
        st.selectbox("Select File Seperator/Delimiter", separator_choices, key="delim")
    upload_form.form_submit_button(label='Profile Data', on_click=profiler)
if __name__ == "__main__":
    st.write("Data Profiler :wave:")
    # Render the upload form; its submit callback chains into the profiler
    # and, from there, the clustering form. Without this call none of the
    # functions defined above are ever reached.
    data_uploader_form()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment