Skip to content

Instantly share code, notes, and snippets.

@pepijndevos
Created December 25, 2018 11:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pepijndevos/64e8672e22282ed5868502be7cc9b898 to your computer and use it in GitHub Desktop.
Save pepijndevos/64e8672e22282ed5868502be7cc9b898 to your computer and use it in GitHub Desktop.
Parse WhatsApp chats
import glob
import os
import re
import sys

import matplotlib.pyplot as plt
import pandas as pd
# One exported chat entry: "<date>, <time>[ am/pm] - <name>" followed by either
# a group action ("joined", "left", ...) or ": <message>".  A message keeps
# absorbing following lines until a line that starts with a new timestamp.
#
# The pattern used to embed "(?i)" in the middle of the expression.  Python
# treats that as a *global* flag and rejects it outright since 3.11, so the
# flag is now supplied through re.IGNORECASE below, which matches identically.
#
# NOTE(review): the greedy ".*" in <name> can swallow the start of a message
# that itself contains ": " or an action word such as " left".  Left as-is
# because it does not affect the per-month counts plotted below.
regex = r"""(?P<datetime>\d{1,2}\/\d{1,2}\/\d{1,4}, \d{1,2}:\d{1,2}( [ap]m)*) - (?P<name>.*(?::\s*\w+)*|[\w\s]+?)(?:\s+(?P<action>joined|left|was removed|changed the (?:subject to "\w+"|group's icon))|:\s(?P<message>(?:.+|\n(?!\d{1,2}\/\d{1,2}\/\d{1,4}, \d{1,2}:\d{1,2}( [ap]m)*))+))"""

ENTRY_RE = re.compile(regex, re.IGNORECASE)

COLUMNS = ['datetime', 'name', 'message']

# Presumably the 19-character prefix that the original ``fname[19:-4]`` slice
# was skipping -- TODO confirm against real export file names.
EXPORT_PREFIX = "WhatsApp Chat with "


def parse_messages(text):
    """Return a list of ``(datetime, name, message)`` string triples.

    Every entry matched by ``regex`` yields one triple.  Entries that are
    group actions (no ``message`` group) get an empty message string, so they
    are still counted as one entry downstream.
    """
    return [
        tuple(match.group(field) or "" for field in COLUMNS)
        for match in ENTRY_RE.finditer(text)
    ]


def chat_label(fname):
    """Derive a legend label from an export file name.

    Drops any directory part and the extension, then strips
    ``EXPORT_PREFIX`` when present, so ``"dir/WhatsApp Chat with Bob.txt"``
    becomes ``"Bob"`` and other names fall back to their bare stem.
    """
    stem = os.path.splitext(os.path.basename(fname))[0]
    if stem.startswith(EXPORT_PREFIX):
        stem = stem[len(EXPORT_PREFIX):]
    return stem


def load_chat(fname):
    """Read one export file into a DataFrame indexed by entry timestamp."""
    # Decode explicitly instead of relying on the platform default encoding;
    # assumes the export is UTF-8 -- TODO confirm for other export sources.
    with open(fname, encoding="utf-8") as f:
        text = f.read()
    df = pd.DataFrame(parse_messages(text), columns=COLUMNS)
    df['datetime'] = pd.to_datetime(df['datetime'], dayfirst=True)
    df.set_index('datetime', inplace=True)
    return df


def monthly_counts(df):
    """Return the number of entries per calendar month as a Series.

    Months without entries between the first and last one appear with a
    count of 0.  Swap ``.count()`` for ``.sum()`` to get characters per month.
    """
    lengths = df.message.str.len()
    # An offset object instead of the 'M' alias: same month-end bins, but not
    # subject to the alias rename in newer pandas releases.
    return lengths.groupby(pd.Grouper(freq=pd.offsets.MonthEnd())).count()


def main(files):
    """Plot entries per month for each export file on one shared figure."""
    if not files:
        sys.exit("usage: pass one or more exported chat .txt files")

    ax = None
    labels = []
    for fname in files:
        df = load_chat(fname)
        if df.empty:
            # Nothing to plot; skipping also keeps the legend aligned with
            # the lines that were actually drawn.
            print("{}: no chat entries recognised, skipping".format(fname),
                  file=sys.stderr)
            continue
        # Series.plot() draws onto the current axes, so all chats share one.
        ax = monthly_counts(df).plot()
        labels.append(chat_label(fname))

    if ax is None:
        sys.exit("no chat entries found in any input file")

    ax.legend(labels)
    plt.ylabel("messages/month")
    plt.show()


if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment