Skip to content

Instantly share code, notes, and snippets.

@not7cd
Created March 6, 2019 21:23
Show Gist options
  • Save not7cd/99ed913b419c461a9d05e20205ae73dc to your computer and use it in GitHub Desktop.
Save not7cd/99ed913b419c461a9d05e20205ae73dc to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
"""
Messenger history analyzer and plotter.

Short script that analyzes past Messenger messages and draws a stackplot of
them over time.

USAGE
1. Download your Facebook data in JSON format.
2. `cd` to facebook-your-name/messages.
3. Run the script from there.
"""
import numpy as np
import matplotlib.pyplot as plt
import glob
import json
import pandas as pd
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
def aggregate_msg_count(senders, participants):
    """Count the "Generic" messages each participant sent per calendar day.

    Args:
        senders: DataFrame with a DatetimeIndex and at least the columns
            ``sender_name`` and ``type``. It is not modified.
        participants: Iterable of sender names; each one becomes a column
            of the result.

    Returns:
        DataFrame with one column per participant holding daily message
        counts. Each participant is bucketed independently, so after the
        column-wise concat a day that falls outside a participant's own
        range of days is NaN rather than 0.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``participants`` is empty.
    """
    # The type filter does not depend on the participant; compute it once.
    is_generic = senders["type"] == "Generic"
    per_person = []
    for p in participants:
        stamps = senders.loc[(senders["sender_name"] == p) & is_generic].index
        # Build a fresh frame holding a 1 per message instead of rewriting
        # the name column of a filtered slice in place (chained assignment,
        # which pandas does not guarantee to propagate).
        ones = pd.DataFrame({p: 1}, index=stamps)
        per_person.append(ones.groupby(pd.Grouper(freq="D")).sum())
    return pd.concat(per_person, axis=1)
def _count_words(text):
    """Return the number of whitespace-separated words in *text* (0 if missing)."""
    if isinstance(text, str):
        return len(text.split())
    # Messages without text show up as None/NaN/NA; they contain no words.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return 0
    return len(str(text).split())


def aggregate_word_count(senders, participants):
    """Sum the words each participant wrote in "Generic" messages per day.

    Args:
        senders: DataFrame with a DatetimeIndex and at least the columns
            ``sender_name``, ``type`` and ``content``. It is not modified.
        participants: Iterable of sender names; each one becomes a column
            of the result.

    Returns:
        DataFrame with one column per participant holding daily word
        totals. Each participant is bucketed independently, so after the
        column-wise concat a day that falls outside a participant's own
        range of days is NaN rather than 0.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``participants`` is empty.
    """
    # The type filter does not depend on the participant; compute it once.
    is_generic = senders["type"] == "Generic"
    per_person = []
    for p in participants:
        texts = senders.loc[(senders["sender_name"] == p) & is_generic, "content"]
        # Build a new frame rather than assigning into a filtered slice of
        # the caller's data (SettingWithCopy hazard).
        words = pd.DataFrame(
            {p: [_count_words(t) for t in texts]}, index=texts.index
        )
        per_person.append(words.groupby(pd.Grouper(freq="D")).sum())
    return pd.concat(per_person, axis=1)
def extract_msg_count(msgs, ignore=("Your Name",)):
    """Turn one parsed chat file into a per-participant daily word-count table.

    Despite the name, the result comes from ``aggregate_word_count``, so the
    values are word totals rather than message counts.

    Args:
        msgs: Parsed chat JSON; must provide ``msgs["messages"]`` (records
            with ``timestamp_ms``, ``sender_name``, ``content`` and ``type``)
            and ``msgs["participants"]`` (records with ``name``).
        ignore: Participant names to leave out of the result. An immutable
            tuple is used as the default so it cannot be altered between
            calls.

    Returns:
        DataFrame indexed by day with one column per remaining participant.

    Raises:
        KeyError: If the messages lack one of the required fields.
        ValueError: If more than three participants remain after filtering.
    """
    df = pd.DataFrame(msgs["messages"])
    # Copy the column subset so that replacing its index below acts on an
    # independent frame, not on a view of ``df``.
    df_ = df[["timestamp_ms", "sender_name", "content", "type"]].copy()
    df_.index = pd.to_datetime(df_["timestamp_ms"].tolist(), unit="ms")

    participants = [
        m["name"] for m in msgs["participants"] if m["name"] not in ignore
    ]
    # Chats with more than three other people are rejected.
    if len(participants) > 3:
        print(len(participants))
        raise ValueError("too big xd")

    return aggregate_word_count(df_, participants)
def aggregate(df, top=12, freq="M"):
    """Re-bucket *df* by *freq* and keep only its highest-total columns.

    Args:
        df: DataFrame with a DatetimeIndex and numeric columns.
        top: Number of columns to keep, ranked by their overall sum.
        freq: pandas offset alias for the bucket size.
            NOTE(review): recent pandas releases prefer "ME" over "M" for
            month-end buckets; confirm against the installed version.

    Returns:
        DataFrame of per-bucket sums restricted to the *top* columns,
        ordered from the smallest of those totals to the largest.
    """
    bucketed = df.groupby(pd.Grouper(freq=freq)).sum()
    ranking = bucketed.sum().sort_values(ascending=False)
    leaders = ranking[:top].index
    keep_mask = bucketed.columns.isin(leaders.tolist())
    kept = bucketed.loc[:, keep_mask]
    # Reverse the ranking so the column with the largest total comes last.
    return kept.reindex(leaders[::-1], axis=1)
def stackplot_messages(df, legend=False):
    """Draw *df* as a "wiggle"-baseline stackplot and show it.

    Args:
        df: DataFrame with a DatetimeIndex; every column becomes one layer
            of the plot, labelled with the column name.
        legend: When true, add a legend in the upper-left corner.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Upsample to daily points and smooth between them with PCHIP so the
    # layers are drawn as curves instead of steps.
    df = df.resample("D").interpolate(method="pchip")

    plt.style.use("default")
    _, ax = plt.subplots(figsize=(20, 6))

    # Colour cycle of 12 evenly spaced shades from the "summer" colormap.
    # NOTE(review): 12 presumably mirrors the default ``top`` of aggregate().
    n_lines = 12
    ax.set_prop_cycle(
        "color", [plt.cm.summer(i) for i in np.linspace(0, 1, n_lines)]
    )

    ax.stackplot(
        df.index.values, df.T, baseline="wiggle", labels=df.columns.values
    )

    # Hide the frame and the y axis.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.yaxis.set_visible(False)

    if legend:
        ax.legend(loc="upper left")
    plt.show()
def collect_messages(files):
    """Load every chat file and merge the per-chat tables into one.

    Args:
        files: Iterable of paths to chat JSON files.

    Returns:
        DataFrame combining all chats that could be processed (rows with
        the same index value are summed, missing cells become 0 once at
        least two chats have been merged), or ``None`` when no chat could
        be processed.

    Raises:
        OSError, ValueError: If a file cannot be opened or is not valid
            JSON; only failures while extracting or merging are skipped.
    """
    chats = None
    for f in files:
        # JSON text is UTF-8; do not depend on the platform's locale codec.
        with open(f, encoding="utf-8") as fp:
            msgs = json.load(fp)
        print(f)

        # Best effort: a chat that cannot be processed is reported and skipped.
        try:
            tmp = extract_msg_count(msgs)
        except Exception as e:
            print(e, f)
            continue

        if chats is None:
            chats = tmp
            continue

        try:
            chats = pd.concat([chats, tmp], axis=0, sort=True).fillna(0)
            # Collapse rows from different chats that share an index value.
            chats = chats.groupby(chats.index).sum()
        except Exception as e:
            print(e, tmp)
    return chats
def main():
    """Find every ``message.json`` below the current directory and plot them."""
    files = glob.glob("**/message.json", recursive=True)
    df = collect_messages(files)
    if df is None:
        # collect_messages returns None when there was nothing it could
        # process; stop here instead of failing inside aggregate().
        print("No usable message.json files found under the current directory.")
        return
    df = aggregate(df)
    stackplot_messages(df)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment