Skip to content

Instantly share code, notes, and snippets.

@tohutohu
Created November 25, 2023 09:40
Show Gist options
  • Save tohutohu/7cf1d9f25cae426efb5a35d473c2e2d7 to your computer and use it in GitHub Desktop.
Save tohutohu/7cf1d9f25cae426efb5a35d473c2e2d7 to your computer and use it in GitHub Desktop.
import pandas as pd
import plotly.graph_objects as go
from tkinter import filedialog, Tk
import re
from collections import defaultdict
def select_file():
    """
    Open a file dialog to select a log file.

    A Tk root window is created only to host the dialog; it is withdrawn so
    it is not shown, and it is destroyed afterwards even if the dialog call
    raises.

    Returns:
        The value returned by ``filedialog.askopenfilename()`` -- the chosen
        path. NOTE(review): presumably an empty (falsy) value when the dialog
        is cancelled; confirm against the tkinter documentation.
    """
    root = Tk()
    try:
        root.withdraw()
        return filedialog.askopenfilename()
    finally:
        # Always release the Tk root, including on an exception.
        root.destroy()
def process_log_data(file_path):
    """
    Process the log data to prepare for the Sankey diagram.

    The CSV must provide the columns ``id`` (user identifier), ``name``
    (page / URL visited) and ``time`` (value the visits are ordered by).

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        A ``(url_to_node, transitions)`` tuple where ``url_to_node`` maps each
        distinct ``name`` to an integer index in order of first appearance,
        and ``transitions`` is a ``defaultdict(int)`` mapping
        ``(source_index, target_index)`` to the number of times a user moved
        from the source page directly to the target page.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    # Read the log data
    df = pd.read_csv(file_path)

    missing = [col for col in ('id', 'name', 'time') if col not in df.columns]
    if missing:
        raise KeyError(
            f"log data is missing required column(s): {', '.join(missing)}"
        )

    # Optional preprocessing steps, currently disabled:
    # Filter out anonymous users
    # df = df[df['id'] != 'anonymous']
    # Replace hex-like segments in URLs
    # df['name'] = df['name'].apply(replace_hex_segments)

    # Map URLs to nodes and count transitions
    url_to_node = {url: idx for idx, url in enumerate(df['name'].unique())}
    transitions = defaultdict(int)

    # One pass over the frame via groupby instead of re-filtering the whole
    # frame for every user. sort=False keeps users in order of first
    # appearance, so transition keys are inserted in the same order as a
    # loop over df['id'].unique() would insert them.
    for _, user_rows in df.groupby('id', sort=False):
        # A stable sort keeps rows with equal timestamps in file order.
        ordered = user_rows.sort_values(by='time', kind='mergesort')
        visited = ordered['name'].tolist()
        for current_page, next_page in zip(visited, visited[1:]):
            edge = (url_to_node[current_page], url_to_node[next_page])
            transitions[edge] += 1

    return url_to_node, transitions
def replace_hex_segments(url):
    """
    Collapse hex-like URL segments into a ``{param}`` placeholder.

    The URL is split on runs of the delimiter characters ``/ . , ? & =``.
    Any segment made up solely of hex digits, ``-`` and ``_`` is replaced by
    ``{param}``; delimiters and all other segments are kept verbatim.

    Args:
        url: The URL string to normalise.

    Returns:
        The URL with every matching segment replaced.
    """
    hex_like = re.compile(r'[0-9a-fA-F\-_]+')

    # The capturing group makes re.split keep the delimiters, so the result
    # alternates segment, delimiter, segment, ... starting with a segment.
    pieces = re.split(r'([/\.,\?&=]+)', url)

    # Even positions are the segments; rewrite only those in place.
    pieces[::2] = [
        '{param}' if hex_like.fullmatch(piece) else piece
        for piece in pieces[::2]
    ]
    return ''.join(pieces)
def create_sankey_diagram(url_to_node, transitions):
    """
    Build a Plotly Sankey figure from the transition counts and display it.

    Args:
        url_to_node: Mapping whose keys, in iteration order, become the node
            labels.
        transitions: Mapping of ``(source, target)`` pairs to a count; the
            pair elements are used as link source/target and the count as
            the link value.
    """
    labels = list(url_to_node)

    # Unpack every (source, target) -> count entry into parallel lists.
    sources, targets, weights = [], [], []
    for (src, dst), count in transitions.items():
        sources.append(src)
        targets.append(dst)
        weights.append(count)

    node_spec = {
        "pad": 15,
        "thickness": 20,
        "line": {"color": "black", "width": 0.5},
        "label": labels,
    }
    link_spec = {
        "source": sources,
        "target": targets,
        "value": weights,
    }

    figure = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])
    figure.update_layout(title_text="User Page Transitions", font_size=10)
    figure.show()
# Main execution: pick a log file, count page transitions, draw the diagram.
file_path = select_file()
if file_path:
    url_to_node, transitions = process_log_data(file_path)
    create_sankey_diagram(url_to_node, transitions)
else:
    # A falsy path means there is nothing to load (e.g. the dialog was
    # dismissed), so skip processing instead of failing inside read_csv.
    print("No file selected; nothing to plot.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment