Created
December 14, 2021 15:20
-
-
Save asehmi/ecd408fa4c39976cb40b18d17a5a1b04 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ISSUE: Stylistically, I prefer to have a top-level controller and then
# form "views" which get laid out on the page in the order they were called.
# With just two forms, though, callback chaining is manageable and won’t result in a tangle,
# but the layout order is inverted, and in this callback-based solution the profile report and forms
# disappear when `cluster_duplicates` is finally called.
#
# (See this thread: https://discuss.streamlit.io/t/selectbox-bug-on-chaining-forms/19927)
#
# For a better solution see: multiform_chaining.py
import streamlit as st | |
from streamlit_pandas_profiling import st_profile_report | |
from streamlit import session_state as session | |
import numpy as np | |
import pandas as pd | |
from pandas_profiling import ProfileReport | |
import streamlit_debug | |
# Configure the streamlit_debug helper once at import time.
# NOTE(review): flag=False presumably leaves remote debugging switched off;
# confirm against the streamlit_debug module.
streamlit_debug.set(
    flag=False,
    wait_for_client=True,
    host='localhost',
    port=8765,
)
def cluster_duplicates(df):
    """Form callback: echo the submitted clustering options and preview ``df``.

    Reads the widget values stored in session state under the keys
    ``col_name``, ``dis_num``, ``dis_non_alphanum``, ``similarity`` and
    ``affinity``, prints them to stdout as one tuple, then writes the chosen
    column name, the head of ``df`` and the unique values of that column to
    the page. No clustering is performed here.

    Args:
        df: DataFrame handed over by the submit button's ``args=(df,)``.
    """
    col_name = session.col_name

    # Debug trace of every option submitted with the form. Each session value
    # is read exactly once; only the column name is needed afterwards.
    print((
        col_name,
        session.dis_num,
        session.dis_non_alphanum,
        session.similarity,
        session.affinity,
    ))

    st.write(col_name)
    st.write(df.head())
    st.write(df[col_name].unique())
@st.experimental_memo(show_spinner=True, persist='disk')
def get_profile_report(file_info, df):
    """Return a minimal, explorative, lazily evaluated profile report for ``df``.

    Args:
        file_info: Not read in the body; it is only an argument seen by the
            memo decorator (presumably so the cached entry is tied to the
            uploaded file -- confirm).
        df: DataFrame to profile.
    """
    return ProfileReport(df, explorative=True, lazy=True, minimal=True)
def profiler():
    """Form callback: load the uploaded file, show its profile, render the clustering form.

    Reads the uploaded file (session key ``upload``) and the delimiter choice
    (session key ``delim``), loads the data into a DataFrame, writes the file
    metadata and the profile report to the page, and then renders the
    "cluster_duplicates" form whose submit button calls ``cluster_duplicates``
    with the DataFrame.

    If no file has been uploaded, a warning is shown and nothing else happens.
    """
    file = session.upload
    if file is None:
        # The submit button can be clicked before a file is chosen; without
        # this guard read_csv / file.name would raise.
        st.warning("Please upload a file before profiling.")
        return

    if file.name.lower().endswith(".xlsx"):
        # The uploader also accepts .xlsx files, which read_csv cannot parse.
        df = pd.read_excel(file)
    else:
        # Options look like "comma (,)": take the second word and strip the
        # surrounding parentheses. The tab option yields the two-character
        # text backslash-t, which the python engine interprets as a regular
        # expression because it is longer than one character.
        delimiter = session.delim.split(" ")[1][1:-1]
        df = pd.read_csv(file, sep=delimiter, engine="python")

    file_info = {"Filename": file.name, "FileType": file.type, "FileSize": file.size}
    st.write(file_info)
    st_profile_report(get_profile_report(file_info, df))

    cluster_duplicates_form = st.form(key="cluster_duplicates")
    with cluster_duplicates_form:
        # The widget values are read back in cluster_duplicates through their
        # session-state keys, so the return values are not kept here.
        st.selectbox("Select column for clustering", list(df.columns), key="col_name")
        st.checkbox("discard_numeric", key="dis_num")
        st.checkbox("discard_nonalpha_numeric", key="dis_non_alphanum")
        st.radio(
            label="Select Similarity Measure",
            options=["levenshtein (recommended)", "cosine", "jaro_winkler", "trigram",
                     "levenshtein_partial"],
            key="similarity",
        )
        st.radio(
            label="Select Distance Measure",
            options=["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"],
            key="affinity",
        )
        cluster_duplicates_form.form_submit_button(
            label="Cluster Duplicates",
            on_click=cluster_duplicates,
            args=(df,),
        )
def data_uploader_form():
    """Render the upload form: a file picker, a delimiter chooser and a submit button.

    The widgets are registered under the session keys ``upload`` and
    ``delim``; pressing "Profile Data" runs ``profiler`` as the click callback.
    """
    # Each option is "<name> (<separator>)"; profiler extracts the part in
    # parentheses.
    separator_choices = ["pipe (|)", r"tab (\t)", "comma (,)", "semicolon (;)"]

    file_upload_form = st.form(key="file_upload")
    with file_upload_form:
        st.file_uploader("Upload File", type=['csv', 'xlsx'], key="upload")
        st.selectbox("Select File Seperator/Delimiter", separator_choices, key="delim")
        file_upload_form.form_submit_button(label='Profile Data', on_click=profiler)
if __name__ == "__main__":
    # Script entry point: write the page heading, then render the upload form.
    # st.set_page_config(layout="wide")
    st.write("Data Profiler :wave:")
    data_uploader_form()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment