Skip to content

Instantly share code, notes, and snippets.

View nicolasesnis's full-sized avatar
🤠
if the earth is flat then why is my life going downhill

Nicolas Esnis nicolasesnis

🤠
if the earth is flat then why is my life going downhill
View GitHub Profile
import seaborn as sns
import pandas as pd
import plotly.graph_objects as go
import chart_studio.plotly as py
import plotly
# Path to the raw data: https://gist.github.com/nicolasesnis/eb3b35545e97926ab53e0617c5e4b639
# Read the CSV into a DataFrame and keep only the four listed columns, in this
# order. The file path below is a placeholder: point it at your local copy.
data = pd.read_csv('your/path/to/the/raw/data.csv')[
['user_id', 'time_install', 'event_name', 'time_event']]
import plotly.graph_objects as go
import chart_studio.plotly as py
import plotly
fig = go.Figure(data=[go.Sankey(
node=dict(
thickness=10, # default is 20
line=dict(color="black", width=0.5),
label=labels,
color=colors
# Flatten the nested links dict into parallel lists: position i of `sources`,
# `targets` and `values` together describes one source -> target link.
sources, targets, values = [], [], []
time_to_next = []
for source_key, source_value in output['links_dict'].items():
    # source_value is the mapping stored under source_key, i.e. its targets.
    for target_key, target_value in source_value.items():
        sources.append(source_key)
        targets.append(target_key)
        values.append(target_value['unique_users'])
import seaborn as sns
# Working on the nodes_dict
# Distinct values of the event_name column, in the order unique() returns them.
all_events = list(data.event_name.unique())
# Create a set of colors that you'd like to use in your plot.
# Seven colors written as 6-digit hex strings without a leading '#'.
palette = ['50BE97', 'E4655C', 'FCC865',
'BFD6DE', '3E5066', '353A3E', 'E6E6E6']
# Here, the colors are given as HEX, but they need to be passed as RGB. This loop will convert from HEX to RGB:
# NOTE(review): the conversion loop announced above is not present in this excerpt.
user_id event_name time_event rank_action next_action time_to_next
001... install 2020-03-05 15:48:00 1 signup 0 days 00:03:00
001... signup 2020-03-05 15:51:00 2 NaN NaT
003... install 2020-03-02 04:38:00 1 signup 0 days 00:04:00
003... signup 2020-03-02 04:42:00 2 reopen 2 days 02:42:00
003... reopen 2020-03-04 07:24:00 3 NaN NaT
import pandas as pd
# Load the raw event log and keep only the four listed columns.
data = pd.read_csv('raw.csv')[
    ['user_id', 'time_install', 'event_name', 'time_event']]
# Start by making sure that time_event and time_install are pandas datetime types.
# The raw values (e.g. 43892.9781365741) are spreadsheet serial dates, i.e.
# fractional days counted from 1899-12-30, not Unix seconds. Parsing them with
# unit='s' would put every row on 1970-01-01, so convert with unit='D' and that
# origin instead. Rounding to the second removes the float noise left over from
# the day fraction (43892.9781365741 -> 2020-03-02 23:28:31).
# Remove unit/origin (and the rounding) if your data is already datetime-like.
data['time_event'] = pd.to_datetime(
    data['time_event'], unit='D', origin='1899-12-30').dt.round('s')
data['time_install'] = pd.to_datetime(
    data['time_install'], unit='D', origin='1899-12-30').dt.round('s')
# Make sure that there's no event occurring before time_install
We can't make this file beautiful and searchable because it's too large.
user_id,time_install,event_name,time_event
4c6065c9466bc68e324e316edfb0227ff7cccc6c,43892.9781365741,purchase,43892.9838078704
f3049eac4788ffd4482390f8333d7e1adbf4c5a1,43896.2203703704,signup,43896.2207986111
f3049eac4788ffd4482390f8333d7e1adbf4c5a1,43896.2203703704,purchase,43896.2218865741
9c1e35e89a374207409ad05da6d69d43f427c5f2,43896.7880555556,reopen,43896.1077083333
723fba1295b9a7c8321bbc433f87629b90660582,43892.0000462963,reopen,43893.5774884259
723fba1295b9a7c8321bbc433f87629b90660582,43892.0000462963,signup,43892.0025231481
97963ab19ae86316d639e3a181a156c6c9dd5be0,43892.9100231481,purchase,43893.0026157407
97963ab19ae86316d639e3a181a156c6c9dd5be0,43892.9100231481,signup,43892.9137152778
97963ab19ae86316d639e3a181a156c6c9dd5be0,43892.9100231481,purchase,43892.9711689815
user_id time_install event_name time_event
4c6065c9466bc6ad... 2020-03-02 23:28:31 purchase 2020-03-02 23:36:41
f3049eac4788ffak... 2020-03-06 05:17:20 signup 2020-03-06 05:17:57
f3049eac4788ffak... 2020-03-06 05:17:20 purchase 2020-03-06 05:19:31
9c1e35e89a3742e1... 2020-03-06 18:54:48 reopen 2020-03-06 02:35:06
723fba1295b9a7fb... 2020-03-02 00:00:04 reopen 2020-03-03 13:51:35