Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# ISSUE: Stylistically, I prefer to have a top-level controller and then
# form "views" which get laid out on the page in the order they were called.
# With just two forms, though, callback chaining is manageable and won’t result in a tangle.
# However, the layout order is inverted, and in this callback-based solution the profile report
# and forms disappear when `cluster_duplicates` is finally called.
#
# (See this thread: https://discuss.streamlit.io/t/selectbox-bug-on-chaining-forms/19927)
#
# For a better solution see: multiform_chaining.py
import streamlit as st
from streamlit_pandas_profiling import st_profile_report
from streamlit import session_state as session
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import streamlit_debug
# NOTE(review): streamlit_debug is a helper module not shown in this file. This call
# presumably configures remote-debugger attachment on localhost:8765, and flag=False
# presumably leaves it switched off -- confirm against the streamlit_debug module.
streamlit_debug.set(flag=False, wait_for_client=True, host='localhost', port=8765)
def cluster_duplicates(df):
    """Submit callback of the clustering form: echo the chosen options.

    Reads the five clustering widget values from Streamlit session state,
    prints them to stdout as a tuple, then writes the selected column name,
    ``df.head()`` and the unique values of the selected column to the page.
    No clustering is performed in this function.

    Args:
        df: Object passed through ``args=(df,)`` by the submit button; it is
            used via ``df.head()`` and ``df[col_name].unique()``.
    """
    option_keys = ("col_name", "dis_num", "dis_non_alphanum", "similarity", "affinity")
    # Attribute-style access on session state, one read per widget key.
    session_args_tuple = tuple(getattr(session, key) for key in option_keys)
    print(session_args_tuple)

    # Only the column name is used below; the remaining options are just echoed above.
    selected_column = session_args_tuple[0]

    st.write(selected_column)
    st.write(df.head())
    st.write(df[selected_column].unique())
@st.experimental_memo(show_spinner=True, persist="disk")
def get_profile_report(file_info, df):
    """Build a ``ProfileReport`` for ``df``.

    Args:
        file_info: Not used in the body. As an argument of the memoized
            function it presumably contributes to the cache key -- confirm
            against the Streamlit memo documentation.
        df: Data to profile; passed straight to ``ProfileReport``.

    Returns:
        A ``ProfileReport`` constructed with ``explorative=True``,
        ``lazy=True`` and ``minimal=True``.
    """
    return ProfileReport(df, explorative=True, lazy=True, minimal=True)
def profiler():
    """Submit callback of the upload form: load, profile, and offer clustering.

    Reads the uploaded file (session key ``upload``) and the delimiter label
    (session key ``delim``), parses the file with ``pd.read_csv``, shows basic
    file metadata and the profile report, and then renders the
    ``cluster_duplicates`` form whose submit button calls
    ``cluster_duplicates(df)``.

    If no file has been uploaded, a warning is shown and nothing else is
    rendered.
    """
    uploaded_file = session.upload
    if uploaded_file is None:
        # The upload form can be submitted with no file chosen; without this
        # guard pd.read_csv(None, ...) would raise.
        st.warning("Please upload a file before profiling.")
        return

    # Labels look like "comma (,)": take the second word and strip the
    # surrounding parentheses. The tab label yields the two-character string
    # `\t`; with engine="python" a multi-character separator is treated as a
    # regular expression, so it still matches a tab.
    delimiter = session.delim.split(" ")[1][1:-1]

    # NOTE(review): the uploader also accepts .xlsx, but every upload is parsed
    # as delimited text here -- confirm whether Excel files should be supported.
    df = pd.read_csv(uploaded_file, sep=delimiter, engine="python")

    file_info = {
        "Filename": uploaded_file.name,
        "FileType": uploaded_file.type,
        "FileSize": uploaded_file.size,
    }
    st.write(file_info)

    pr = get_profile_report(file_info, df)
    st_profile_report(pr)

    cluster_duplicates_form = st.form(key="cluster_duplicates")
    with cluster_duplicates_form:
        # Widget values are read back from session state by key in
        # cluster_duplicates, so the return values are not kept here.
        st.selectbox("Select column for clustering", list(df.columns), key="col_name")
        st.checkbox("discard_numeric", key="dis_num")
        st.checkbox("discard_nonalpha_numeric", key="dis_non_alphanum")
        st.radio(
            label="Select Similarity Measure",
            options=[
                "levenshtein (recommended)",
                "cosine",
                "jaro_winkler",
                "trigram",
                "levenshtein_partial",
            ],
            key="similarity",
        )
        st.radio(
            label="Select Distance Measure",
            options=["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"],
            key="affinity",
        )
    cluster_duplicates_form.form_submit_button(
        label="Cluster Duplicates",
        on_click=cluster_duplicates,
        args=(df,),
    )
def data_uploader_form():
    """Render the file-upload form.

    The form holds a file uploader (session key ``upload``) and a delimiter
    select box (session key ``delim``). Its "Profile Data" submit button
    invokes ``profiler`` as an ``on_click`` callback.
    """
    # Each label is "<name> (<separator>)"; profiler extracts the part in parentheses.
    delimiter_labels = ["pipe (|)", r"tab (\t)", "comma (,)", "semicolon (;)"]

    upload_form = st.form(key="file_upload")
    with upload_form:
        st.file_uploader("Upload File", type=["csv", "xlsx"], key="upload")
        st.selectbox("Select File Seperator/Delimiter", delimiter_labels, key="delim")
    upload_form.form_submit_button(label="Profile Data", on_click=profiler)
if __name__ == "__main__":
    # Script entry point: show the page title, then the upload form.
    # st.set_page_config(layout="wide")
    st.write("Data Profiler :wave:")
    data_uploader_form()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment