-
-
Save aussetg/3dbdc615d844b36253805936fea21efe to your computer and use it in GitHub Desktop.
Downloads, archives, analyzes and plots Facebook Messenger conversations (individual and group)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import collections | |
import copy | |
import datetime | |
import functools | |
import itertools | |
import json | |
import lzma | |
import gzip | |
import operator | |
import os | |
import pprint | |
import sys | |
import time | |
import re | |
import sqlite3 | |
import glob | |
import tempfile | |
import shutil | |
import getpass | |
import argparse | |
import math | |
import contextlib | |
import requests | |
import lxml.html | |
import numpy | |
# Optional dependencies: each one degrades gracefully to None when missing so
# feature code can gate on truthiness (e.g. `if termgraph:`). The warning is
# printed once at import time.
# BUG FIX: the selenium fallback previously used a bare `except:`, which would
# also swallow SystemExit/KeyboardInterrupt; narrowed to ImportError.
try:
    import matplotlib
    import matplotlib.pyplot as plot
except ImportError:
    print('Failed to import matplotlib, plotting will not be available.')
    matplotlib = plot = None
try:
    import wordcloud
except ImportError:
    print('Failed to import wordcloud, word clouds will not be available.')
    wordcloud = None
try:
    import termgraph
except ImportError:
    print('Failed to import termgraph, terminal graphs will not be displayed.')
    termgraph = None
try:
    import selenium
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.action_chains import ActionChains
except ImportError:
    print('Failed to import selenium, browser authentication will not be available.')
    webdriver = None
# Maps Facebook numeric user ids (as strings) to human-readable display names.
# Used both for pretty-printing and, inverted, for name -> id lookup (userToId).
idToUserMap = {
    '100004322110944': 'Rowena',
    '1338262658': 'Sushain',
    '100004252695708': 'Michelle N.',
    '100009196845865': 'Brittany',
    '100003355055997': 'Karen',
    '100000534453859': 'Ashley',
    '100004476751719': 'Benji',
    '1626215140': 'Maxine',
    '777993547': 'Jean',
    '1311112684': 'Keerthana',
    '1412264090': 'Michelle H.',
    '100002628181062': 'Pallavi',
    '100001184052364': 'Shreya',
    '100000284933876': 'Ani',
    '100002398916527': 'Amisha',
    '100006790448156': 'Serena',
    '100002878482600': 'Tiffany',
    '100002576434633': 'Snigdha',
    '1333603699': 'Saloni',
    '1841753743': 'Christina',
    '100003127069904': 'Tiffany Do',
    '1253817276': 'Alexander',
    '100000986269083': 'Prachi',
    '100000241495175': 'Eric'
}
# Thread fbids of group conversations to download/analyze (groups use
# 'thread_fbids' rather than 'user_ids' in the mercury API payload).
groups = [494248544089735, 1513200892278424, 322604171221575, 1021123947914529,
          879550675408978, 940010492714431, 1700273163527834, 1097674336985252]
# Regular expressions for profanity counting; matched case-insensitively in
# messagesStats.
profanity = [r'\bfuck+(?:ing|ed|er)?\b', r'\b(?:dip)?shit+(?:ty+)?\b', r'\bdamn(?:it+)?\b',
             r'\bgoddamn\b', r'\bdick\b', r'\bbullshit+\b', r'\bbastard\b', r'\bhell+\b',
             r'\bbitch(?:ass)?\b', r'\bass+\b', r'\ba(?:ss)?hole\b', r'\bmotherfucker\b']
# Authenticated Facebook session state: account email, numeric user id,
# requests.Session, and the DTSG CSRF token scraped after login.
Session = collections.namedtuple('Session', ['username', 'id', 'session', 'dtsg'])
# Module-level current session; populated lazily by login() / getNewMessages().
session = Session(None, None, None, None)
def idToUser(userid):
    """Resolve a Facebook user id (optionally 'fbid:'-prefixed) to a display name.

    Ids absent from idToUserMap get a stable-per-run 'Unknown_XXXX' placeholder
    derived from the id's hash.
    """
    normalized = str(userid).replace('fbid:', '')
    known = idToUserMap.get(normalized)
    if known is not None:
        return known
    return 'Unknown_' + str(hash(normalized)).replace('-', '')[:4]
def userToId(user):
    """Inverse of idToUserMap: case-insensitive display name -> id, or None."""
    lookup = {name.lower(): uid for uid, name in idToUserMap.items()}
    return lookup.get(user.lower())
# Schema of the fb_messages table beyond the fixed id/account/conversation_id/
# timestamp columns, keyed by SQLite storage strategy:
#   'text'    -> stored verbatim as TEXT
#   'json'    -> serialized with json.dumps into a *_json TEXT column
#   'boolean' -> coerced with bool() into a BOOLEAN column
# The order here defines the column order used by initDB and insertMessages.
fields = {
    'text': ['thread_id', 'threading_id', 'author_email', 'message_id', 'folder', 'source', 'html_body', 'author', 'subject', 'body', 'action_id', 'action_type', 'other_user_fbid', 'forward_count', 'offline_threading_id', 'log_message_type', 'log_message_body', 'location_text'],
    'json': ['forward_message_ids', 'raw_attachments', 'attachments', 'ephemeral_ttl_mode', 'ranges', 'source_tags', 'tags', 'log_message_data', 'coordinates'],
    'boolean': ['has_attachment', 'is_spoof_warning', 'is_filtered_content', 'is_filtered_content_invalid_app', 'is_filtered_content_account', 'is_forward', 'is_unread', 'is_filtered_content_bh', 'is_filtered_content_quasar', 'is_sponsored']
}
def insertMessages(cursor, messages, conversationID):
    """Flatten raw Facebook message dicts and bulk-insert them into fb_messages.

    Each message is destructively consumed (fields are popped off) so that any
    field the schema does not account for is surfaced by the final assert.
    Messages whose thread_fbid does not match conversationID are skipped with a
    diagnostic print.
    """
    conversationID = int(conversationID)
    # 4 fixed columns (id, account, conversation_id, timestamp) plus the schema.
    numFields = 4 + len(fields['text']) + len(fields['json']) + len(fields['boolean'])
    def formatMessageJSON(conversationID, message):
        # Leading None lets SQLite assign the AUTOINCREMENT id.
        messageFields = [None, session.username, conversationID]
        if int(message['thread_fbid']) != conversationID:
            # TODO: Figure out why this is happening and how to properly filter
            print(conversationID, 'ignoring message', message['thread_fbid'])
            return None
        del message['thread_fbid']
        for text_field in fields['text']:
            messageFields.append(str(message.pop(text_field, '')))
        for json_field in fields['json']:
            messageFields.append(json.dumps(message.pop(json_field, None)))
        for boolean_field in fields['boolean']:
            messageFields.append(bool(message.pop(boolean_field, False)))
        messageFields.append(int(message.pop('timestamp')))
        # Derived/display-only timestamp variants are always present; drop them.
        for field in ['timestamp_relative', 'timestamp_datetime', 'timestamp_absolute', 'timestamp_time_passed']:
            del message[field]
        # Known-irrelevant fields that may or may not appear.
        for field in ['platform_xmd', 'meta_ranges', 'commerce_message_type', 'customizations', 'message_source', 'montage_reply_data', 'skip_bump_thread']:
            message.pop(field, None)
        if message:
            # Print any leftover fields before failing so the schema can be extended.
            print(message)
        assert not message
        return messageFields
    formattedMessages = [row for row in (formatMessageJSON(conversationID, m) for m in messages) if row is not None]
    cursor.executemany('INSERT INTO fb_messages VALUES (%s)' % ', '.join(['?'] * numFields), formattedMessages)
def initDB(dbPath, fbChatsPath=None):
    """Create the fb_messages SQLite database and gzip it.

    dbPath: path for the (temporary) uncompressed database; the final artifact
    is dbPath with '.db' replaced by '.db.gz', and the uncompressed file is
    removed afterwards.
    fbChatsPath: optional glob template (with one %s placeholder) of
    <conversation_id>.json.xz archives to bulk-load.
    """
    # contextlib.closing guarantees the connection is released even if an
    # insert fails (the original leaked it on error).
    with contextlib.closing(sqlite3.connect(dbPath)) as conn:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE fb_messages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            account TEXT,
            conversation_id UNSIGNED BIG INT,
            %s,
            %s,
            %s,
            timestamp DATETIME
        )''' % (
            ', '.join(map(lambda x: '%s TEXT' % x, fields['text'])),
            ', '.join(map(lambda x: '%s_json TEXT' % x, fields['json'])),
            ', '.join(map(lambda x: '%s BOOLEAN' % x, fields['boolean']))
        ))
        if fbChatsPath:
            for fname in glob.glob(fbChatsPath % '*'):
                # File names are '<conversation_id>.json.xz'.
                conversationID = int(os.path.basename(fname).replace('.json.xz', ''))
                with lzma.open(fname) as f:
                    messages = json.loads(f.read().decode('utf-8'))
                insertMessages(cursor, messages, conversationID)
        conn.commit()
    # Store the database gzip-compressed and drop the raw copy.
    with gzip.open(dbPath.replace('.db', '.db.gz'), 'wb') as db_compressed, \
            open(dbPath, 'rb') as db_uncompressed:
        db_compressed.writelines(db_uncompressed)
    os.remove(dbPath)
def getNewMessages(conversationID, group=False, oldMessages=None, limit=2000):
    """Fetch messages for one conversation from Facebook's mercury endpoint.

    Pages backwards through history `limit` messages at a time. When
    oldMessages is given (dicts with a millisecond 'timestamp', oldest first),
    fetching stops once already-downloaded history is reached and only strictly
    newer messages are returned. On repeated request failures the batch size is
    halved; below 10 the function gives up and returns what it has.
    Logs in lazily via login() if the module-level session is incomplete.
    """
    global session
    if not (session.session and session.dtsg and session.id):
        session = login()
    s, dtsg = session.session, session.dtsg
    print(('\nFetching messages from conversation %s' % conversationID) + (' (%s).' % idToUser(conversationID) if not group else '.'))
    if oldMessages:
        newestMessageDate = datetime.datetime.fromtimestamp(oldMessages[-1]['timestamp'] / 1e3)
        print('%s messages currently downloaded.' % len(oldMessages))
    startTime = datetime.datetime.now()
    newMessages = []
    offset = 0
    # Groups are addressed by thread fbid, individuals by user id.
    messagesType = 'user_ids' if not group else 'thread_fbids'
    newestTimestamp = ''
    failures = 0
    while True:
        try:
            data = {
                'messages[%s][%s][offset]' % (messagesType, conversationID): offset,
                'messages[%s][%s][limit]' % (messagesType, conversationID): limit,
                'messages[%s][%s][timestamp]' % (messagesType, conversationID): newestTimestamp,
                'client': 'mercury',
                '__user': session.id,
                '__a': '1',
                '__dyn': '7AmanEzUFlym5Q9UoHbgWy1m9JaUK5EK8GAFp8yupFLO0xBxvyui9zob4q8zUK5Uc-dy88awF-qp7yoSy28Oi9x2rmEWfSiVWxeUlG4oCi4aDixa26inzpoS6rCgKmLF5Dxm68',
                '__req': '6',
                'fb_dtsg': dtsg,
                'ttstamp': '265817110078561197411111711395',
                '__rev': '2046448'
            }
            #pprint.pprint(data)
            t = s.post('https://www.facebook.com/ajax/mercury/thread_info.php', data=data).text
            # Strip the anti-JSON-hijacking prefix/suffix before parsing.
            t = json.loads(t[t.index('{'):t.rindex('}')+1])['payload']['actions']
            if newestTimestamp:
                # The message at the pagination timestamp is returned again; drop it.
                t = t[:-1]
            newestTimestamp = t[0]['timestamp']
        except TypeError:
            failures += 1
            print('Failed to fetch messages at offset %s with limit %s (failure #%s).' % (offset, limit, failures))
            if failures > 2:
                print('Changing limit from %s to %s.' % (limit, limit / 2))
                limit /= 2
                failures = 0
                if limit < 10:
                    print('Giving up after fetching %s messages.' % len(newMessages))
                    # BUG FIX: previously fell through to `continue`, so the
                    # loop retried forever after "giving up".
                    break
            continue
        failures = 0
        # Pages arrive newest-first, so prepend to keep chronological order.
        newMessages = t + newMessages
        offset += limit
        oldestMessageDate = datetime.datetime.fromtimestamp(newMessages[0]['timestamp'] / 1e3)
        if len(t) < limit or (oldMessages and oldestMessageDate < newestMessageDate):
            print('Completed fetching %s messages in conversation %s.' % (len(newMessages), conversationID))
            break
        else:
            print('Fetched %s messages, offset at %s, fetched %s messages so far.' % (limit, offset, len(newMessages)))
    if oldMessages:
        # The last page may overlap stored history; keep only strictly newer messages.
        newMessages = list(filter(lambda x: datetime.datetime.fromtimestamp(x['timestamp'] / 1e3) > newestMessageDate, newMessages))
        print('Added %s messages to existing %s messages for a total of %s.' % (len(newMessages), len(oldMessages), len(newMessages) + len(oldMessages)))
    endTime = datetime.datetime.now()
    print('The data retrieval took {} seconds.'.format(endTime - startTime))
    return newMessages
def getMessages(cursor, conversationID, query=None, regularExpression=False, caseSensitive=False):
    """Load a conversation's messages from the database, oldest first.

    query: optional substring (or regex when regularExpression=True) to filter
    message bodies by; caseSensitive controls matching in both modes.
    Returns a list of dicts with author, timestamp, body, has_attachment and
    the JSON-decoded attachments.
    """
    if query and not regularExpression:
        if caseSensitive:
            # BUG FIX: SQLite's LIKE is case-insensitive for ASCII, so the old
            # `body LIKE ?` branch ignored caseSensitive; instr() performs a
            # true case-sensitive substring match.
            messages = cursor.execute('''
                SELECT author, timestamp, body, has_attachment, attachments_json
                FROM fb_messages
                WHERE conversation_id = ? AND instr(body, ?) > 0
                ORDER BY timestamp''',
                (conversationID, query)
            ).fetchall()
        else:
            messages = cursor.execute('''
                SELECT author, timestamp, body, has_attachment, attachments_json
                FROM fb_messages
                WHERE conversation_id = ? AND LOWER(body) LIKE ?
                ORDER BY timestamp''',
                (conversationID, '%%%s%%' % query.lower())
            ).fetchall()
    else:
        messages = cursor.execute('''
            SELECT author, timestamp, body, has_attachment, attachments_json
            FROM fb_messages
            WHERE conversation_id = ?
            ORDER BY timestamp''',
            (conversationID, )
        ).fetchall()
    if regularExpression:
        regex = re.compile(query, flags=(0 if caseSensitive else re.IGNORECASE))
        # BUG FIX: attachment-only messages have a NULL body; guard against
        # passing None to regex.search.
        messages = [m for m in messages if m[2] and regex.search(m[2])]
    return [{
        'author': m[0],
        'timestamp': m[1],
        'body': m[2],
        'has_attachment': bool(m[3]),
        'attachments': json.loads(m[4])
    } for m in messages]
def allDaysSpan(oldest, newest):
    """Return every calendar date from oldest.date() through newest.date(), inclusive.

    Returns an empty list when newest precedes oldest.
    """
    first = oldest.date()
    span = (newest.date() - first).days
    return [first + datetime.timedelta(days=offset) for offset in range(span + 1)]
#@profile
def messagesStats(messages, plotMessageCount=False, plotCumulativeMessageCount=False, wordClouds=False, limitPlotToStreak=False):
    """Print statistics about a conversation's messages and optionally plot them.

    messages: dicts as returned by getMessages — 'timestamp' in milliseconds,
    'author', and for real messages a 'body' plus attachment info.
    plotMessageCount / plotCumulativeMessageCount: matplotlib daily/cumulative
    message-count plots (requires matplotlib to have imported).
    wordClouds: per-user (or combined, for >5 users) word clouds
    (requires wordcloud; font path is macOS-specific).
    limitPlotToStreak: restrict count plots to the longest daily streak.
    """
    startTime = datetime.datetime.now()
    # Extremes start inverted so the first message narrows both.
    oldest, newest = datetime.datetime.max, datetime.datetime.min
    messageCounts = collections.defaultdict(lambda: collections.Counter({'sticker': 0, 'text': 0, 'other': 0}))
    messageContents = collections.defaultdict(list)  # user -> list of message bodies
    daysSpoken = set()  # dates with at least one message
    daysMessages = collections.defaultdict(lambda: collections.defaultdict(int))  # date -> user -> count
    stickerCounts = collections.defaultdict(lambda: collections.Counter())  # user -> sticker URL -> count
    responseTimes = collections.defaultdict(list)  # user -> seconds taken to respond
    messageStreaks = []  # (user, consecutive message count) runs, in order
    users = set()
    lastMessageUser = None
    currentMessageStreak = 0
    lastTimestamp = None
    for message in messages:
        # Timestamps are in milliseconds since the epoch.
        date = datetime.datetime.fromtimestamp(message['timestamp'] / 1e3)
        oldest = min(oldest, date)
        newest = max(newest, date)
        user = idToUser(message['author'])
        daysMessages[date.date()][user] += 1
        daysSpoken.add(date.date())
        users.add(user)
        if lastMessageUser == user:
            # Same speaker as the previous message: extend their streak.
            currentMessageStreak += 1
            lastTimestamp = date
        else:
            # Speaker changed: close out the previous streak and record the
            # responder's delay relative to the last message.
            if lastMessageUser:
                messageStreaks.append((lastMessageUser, currentMessageStreak))
            lastMessageUser = user
            currentMessageStreak = 1
            if lastTimestamp:
                responseTimes[user].append((date - lastTimestamp).total_seconds())
            lastTimestamp = date
        if 'body' in message:
            text = message['body']
            messageCounts[user]['all'] += 1
            if text and len(text):
                messageCounts[user]['text'] += 1
                messageContents[user].append(text)
            else:
                # Empty body: classify as sticker (first attachment) or 'other'.
                if message['has_attachment'] and 'attach_type' in message['attachments'][0] and message['attachments'][0]['attach_type'] == 'sticker':
                    messageCounts[user]['sticker'] += 1
                    stickerCounts[user][message['attachments'][0]['url']] += 1
                else:
                    messageCounts[user]['other'] += 1
        else:
            # Log/system messages (renames, adds, ...) are ignored.
            pass # print(message['log_message_body'])
    print('Conversations amongst %s between %s and %s:\n' % (' & '.join(users), oldest, newest))
    # user -> all of their text joined with newlines (for lengths/regex/wordclouds).
    messageContent = dict(map(lambda x: (x[0], '\n'.join(x[1])), messageContents.items()))
    totalCounts = collections.Counter({'sticker': 0, 'text': 0})
    for person, counts in messageCounts.items():
        totalCounts['sticker'] += counts['sticker']
        totalCounts['text'] += counts['text']
        totalCounts['all'] += counts['all']
        print('%s sent %s total messages, %s text messages (%.2f%%) and %s stickers (%.2f%%). On average, the text messages were %.2f characters long which makes for a total of %s characters.' % (person, counts['all'], counts['text'], float(counts['text']) / counts['all'] * 100, counts['sticker'], float(counts['sticker']) / counts['all'] * 100, float(len(messageContent[person])) / counts['text'], len(messageContent[person])))
    # For each day, who sent the most messages; then how often each user "won".
    topMessages = dict(map(lambda x: (x[0], sorted(x[1].items(), key=operator.itemgetter(1), reverse=True)[0][0]), daysMessages.items()))
    topMessagesCounts = sorted(list(collections.Counter(topMessages.values()).items()), key=operator.itemgetter(1), reverse=True)
    if len(topMessagesCounts) == 1:
        print('%s talked the most every day...' % topMessagesCounts[0][0])
    else:
        print('%s talks the most, with %s day(s) when they sent the most messages, and %s is the quiet one with %s day(s).' % (topMessagesCounts[0][0], topMessagesCounts[0][1], topMessagesCounts[1][0], topMessagesCounts[1][1]))
    print('\nSo, a total of %s messages, %s text messages (%.2f%%) and %s stickers (%.2f%%).' % (totalCounts['all'], totalCounts['text'], float(totalCounts['text'])/totalCounts['all'] * 100, totalCounts['sticker'], float(totalCounts['sticker']) / totalCounts['all'] * 100))
    allDays = allDaysSpan(oldest, newest)
    print('That makes for an average of %.2f messages per day!' % (float(totalCounts['all']) / len(allDays)))
    print('Over the span of %s day(s), %s day(s) went without conversation (%.2f%%).' % (len(allDays), len(set(allDays) - daysSpoken), float(len(set(allDays) - daysSpoken)) / len(allDays) * 100))
    print('So, if we take that into account, it makes for an average of %.2f messages on days with conversation!' % (float(totalCounts['all']) / len(daysSpoken)))
    # Per-user profanity: for each pattern, record only its most common variant.
    profanityCounts = collections.defaultdict(dict)
    for user in users:
        for word in profanity:
            matches = re.findall(word, messageContent[user], flags=re.IGNORECASE)
            if matches:
                mostCommon = collections.Counter(map(str.lower, matches)).most_common(1)[0]
                profanityCounts[user][mostCommon[0]] = mostCommon[1]
    profanityTotalCounts = list(reversed(sorted(list(map(lambda x: (x[0], sum(x[1].values())), profanityCounts.items())), key=operator.itemgetter(1))))
    print('\n%s has the potty mouth with %s profane word(s) said whereas %s.' % (profanityTotalCounts[0][0], profanityTotalCounts[0][1], ', '.join(map(lambda x: '%s has said %s profane word(s)' % x, profanityTotalCounts[1:]))))
    for user in sorted(users, key=lambda x: - dict(profanityTotalCounts).get(x, 0)):
        userProfanityCounts = list(reversed(sorted(profanityCounts[user].items(), key=operator.itemgetter(1))))
        if userProfanityCounts:
            print('%s\'s profanity of choice seems to be "%s" (%s occurences), they\'re also fans of %s.' % (user, userProfanityCounts[0][0], userProfanityCounts[0][1], ', '.join(map(lambda x: '"%s" (%s)' % x, userProfanityCounts[1:])) or 'apparently not much else'))
        else:
            print('%s hasn\'t been the slightest bit profane.' % user)
    print('\nJust in case you\'re curious, the most eventful day was %s, when %s messages were sent :D' % tuple(max(map(lambda x: (x[0], functools.reduce(lambda s, a: s + a[1], x[1].items(), 0)), daysMessages.items()), key=operator.itemgetter(1))))
    # Longest run of consecutive days with at least one message.
    longestseq, currentseq = [], []
    for day in sorted(list(daysSpoken)):
        if len(currentseq) > len(longestseq):
            longestseq = copy.copy(currentseq)
        if currentseq and currentseq[-1] + datetime.timedelta(days=1) == day:
            currentseq.append(day)
        else:
            currentseq = [day]
    # The final run is never compared inside the loop; check it here.
    if len(currentseq) > len(longestseq):
        longestseq = copy.copy(currentseq)
    print('The longest streak of days with at least one message lasted %s days, from %s to %s!' % (len(longestseq), longestseq[0], longestseq[-1]))
    if currentseq and datetime.datetime.now().date() - currentseq[-1] <= datetime.timedelta(days=1):
        print('On the other hand, the current streak is %s days, from %s to %s.' % (len(currentseq), currentseq[0], currentseq[-1]))
    else:
        print('On the other hand, the current streak is 0 days, you haven\'t conversed since %s :(' % currentseq[-1])
    print('\nNow, on to stickers. There were an average of %.2f stickers used on days with conversation!' % (float(totalCounts['sticker']) / len(daysSpoken)))
    for user in users:
        print('Out of %s\'s %s stickers, the five most used were: ' % (user, messageCounts[user]['sticker']) + ', '.join(list(map(lambda x: '%s (%s)' % x, stickerCounts[user].most_common(5)))))
    # user -> Counter of streak lengths, for the weighted averages below.
    messageStreaksPerUser = {}
    for user in users:
        messageStreaksPerUser[user] = collections.Counter(map(operator.itemgetter(1), filter(lambda x: x[0] == user, messageStreaks)))
    if len(users) == 2 and len(messageStreaks) > 1:
        print('\nSince there are only two people in this conversation, we can do some more calculations!')
        # The first two streaks establish which user is which.
        user1 = messageStreaks[0][0]
        user2 = messageStreaks[1][0]
        # Average reply-burst ratio: messages sent in response per message received.
        sum1, num1, sum2, num2 = 0, 0, 0, 0
        lastMessageStreak = (None, 0)
        for messageStreak in messageStreaks:
            if lastMessageStreak[0] == user1 and messageStreak[0] == user2:
                sum1 += messageStreak[1] / lastMessageStreak[1]
                num1 += 1
            elif lastMessageStreak[0] == user2 and messageStreak[0] == user1:
                sum2 += messageStreak[1] / lastMessageStreak[1]
                num2 += 1
            lastMessageStreak = messageStreak
        print('%s sends %.2f consecutive message on average and for each message, %s responds with %.2f messages on average.' % (user1, numpy.average(list(messageStreaksPerUser[user1].keys()), weights=list(messageStreaksPerUser[user1].values())), user2, sum1 / num1))
        print('On the other hand, %s sends %.2f consecutive message on average and for each message, %s responds with %.2f messages on average.' % (user2, numpy.average(list(messageStreaksPerUser[user2].keys()), weights=list(messageStreaksPerUser[user2].values())), user1, sum2 / num2))
        print('When %s sends a message, %s tends to respond in %.1f seconds (median response time).' % (user1, user2, numpy.median(responseTimes[user2])))
        print('On the other hand, when %s sends a message, %s tends to respond in %.1f seconds (median response time).' % (user2, user1, numpy.median(responseTimes[user1])))
    endTime = datetime.datetime.now()
    print('\nThe data compilation took {} seconds.'.format(endTime - startTime))
    # Per-user line colors, cycled when there are more than four users.
    colors = ['b', 'r', 'g', 'c']
    if plotMessageCount or plotCumulativeMessageCount:
        daysMessagesList = sorted(daysMessages.items(), key=operator.itemgetter(0))
        fig = plot.figure()
        subplotCount = len(list(filter(operator.truth, [plotMessageCount, plotCumulativeMessageCount])))
        if plotMessageCount:
            ax1 = fig.add_subplot(subplotCount, 1, 1)
            plot.xlabel('Date')
            plot.ylabel('Quantity')
            plot.title('Number of Messages')
            plots1 = []
        if plotCumulativeMessageCount:
            ax2 = fig.add_subplot(subplotCount, 1, 2 if plotMessageCount else 1)
            plot.xlabel('Date')
            plot.ylabel('Quantity')
            plot.title('Number of Messages over Time')
            plots2 = []
        for i, user in enumerate(users):
            # (date, count) pairs for this user, with zero-fill for silent days.
            userMessages = list(map(lambda x: (x[0], x[1][user]), filter(lambda y: user in y[1], daysMessagesList)))
            userDays = list(map(operator.itemgetter(0), userMessages))
            for day in filter(lambda x: x not in userDays, allDays):
                userMessages.append((day, 0))
            userMessages = sorted(userMessages, key=operator.itemgetter(0))
            if limitPlotToStreak:
                userMessages = list(filter(lambda x: x[0] >= longestseq[0] and x[0] <= longestseq[-1], userMessages))
            if plotMessageCount:
                plt, = ax1.plot(*zip(*userMessages), '.%s-' % colors[i % len(colors)], label=user)
                plots1.append(plt)
            if plotCumulativeMessageCount:
                # Running total; accumulate keeps the date of the later element.
                cumulativeUserMessages = list(itertools.accumulate(userMessages, func=lambda x, y: (y[0], x[1] + y[1])))
                plt, = ax2.plot(*zip(*cumulativeUserMessages), '.%s-' % colors[i % len(colors)], label=user+' (cumulative)')
                plots2.append(plt)
        if plotMessageCount:
            ax1.legend(handles=plots1)
        if plotCumulativeMessageCount:
            ax2.legend(handles=plots2, loc='lower right')
        plot.show()
    if wordClouds:
        # Ad-hoc stopword additions accumulated over time (chat shorthand,
        # contractions without apostrophes, URL fragments).
        wordcloud.STOPWORDS.update(["T", "t", "P", ":P", "im", "p", 'http', 'https', 'd', 'o'])
        wordcloud.STOPWORDS.update(["u", "ur", "i"])
        wordcloud.STOPWORDS.update(["T", "t", "P", ":P", "lol", "LOL", "yeah", "okay", "oh", "im", "p", 'http', 'https', 'd', 'o', 'want', 'go', 'png', 'skc'])
        wordcloud.STOPWORDS.update(['dont', 'hes', 'whens', 'weve', 'hed', 'theres', 'havent', 'theyll', 'whos', 'theyd', 'youve', 'well', 'theyve', 'wont', 'mustnt', 'isnt', 'ill', 'whys', 'youd', 'wasnt', 'shouldnt', 'youre', 'arent', 'id', 'werent', 'im', 'cant', 'hadnt', 'couldnt', 'doesnt', 'hows', 'its', 'wheres', 'ive', 'didnt', 'whats', 'heres', 'theyre', 'hasnt', 'wouldnt', 'wed', 'shant', 'lets', 'hell', 'shed', 'youll', 'were', 'shes', 'thats'])
        wordcloud.STOPWORDS.update(['think', 'make', 'one', 'wait', 'people'])
        fig = plot.figure()
        if len(users) > 5:
            # Too many users for per-user panels: one combined cloud, plus a
            # personal one when we know who is logged in.
            allContent = '\n'.join(messageContent.values())
            wc = wordcloud.WordCloud(background_color='white', max_words=500, stopwords=wordcloud.STOPWORDS, font_path='/Library/Fonts/Futura.ttc', width=1000, height=1500)
            wc.generate(allContent)
            f = fig.add_subplot(1, 2, 1)
            f.axes.get_xaxis().set_visible(False)
            f.axes.get_yaxis().set_visible(False)
            f.set_title('Everyone')
            plot.imshow(wc)
            if session.id:
                wc = wordcloud.WordCloud(background_color='white', max_words=500, stopwords=wordcloud.STOPWORDS, font_path='/Library/Fonts/Futura.ttc', width=1000, height=1500)
                wc.generate(messageContent[idToUser(session.id)])
                f = fig.add_subplot(1, 2, 2)
                f.axes.get_xaxis().set_visible(False)
                f.axes.get_yaxis().set_visible(False)
                f.set_title('Me')
                plot.imshow(wc)
        else:
            for i, user in enumerate(users):
                wc = wordcloud.WordCloud(background_color='white', max_words=500, stopwords=wordcloud.STOPWORDS, font_path='/Library/Fonts/Futura.ttc', width=1000, height=1500)
                wc.generate(messageContent[user])
                f = fig.add_subplot(1, len(users), i + 1)
                f.axes.get_xaxis().set_visible(False)
                f.axes.get_yaxis().set_visible(False)
                f.set_title(user)
                plot.imshow(wc)
        plot.axis('off')
        plot.show()
def allMessagesStats(cursor, plotMessageCount=False):
    """Print aggregate statistics across every conversation in the database.

    Optionally renders per-conversation/per-author termgraph bar charts (when
    termgraph imported) and a matplotlib plot of daily message counts.
    """
    messages = list(map(lambda x: {'id': x[0], 'timestamp': x[1], 'author': x[2]}, cursor.execute('SELECT conversation_id, timestamp, author FROM fb_messages').fetchall()))
    oldest, newest = datetime.datetime.max, datetime.datetime.min
    daysMessageUserCounts = collections.defaultdict(lambda: collections.defaultdict(int))  # date -> user -> count
    for message in messages:
        # Timestamps are stored in milliseconds.
        date = datetime.datetime.fromtimestamp(message['timestamp'] / 1e3)
        oldest = min(oldest, date)
        newest = max(newest, date)
        user = idToUser(message['author'])
        daysMessageUserCounts[date.date()][user] += 1
    dayMessageCounts = dict(map(lambda x: (x[0], sum(x[1].values())), daysMessageUserCounts.items()))
    missedDays = set(allDaysSpan(oldest, newest)) - set(dayMessageCounts.keys())
    daySpanLength = (newest - oldest).days
    print('You have a total of %s messages spanning %s through %s (%s days)!' % (len(messages), oldest, newest, daySpanLength))
    print('That makes an average of %.02f messages per day unless you account for the %s days without conversation, which makes it %.02f per day.' % (len(messages) / daySpanLength, len(missedDays), len(messages) / (daySpanLength - len(missedDays))))
    print('The last day without messages was %s, %s days ago, and the most eventful day was %s with %s messages.' % (sorted(missedDays)[-1], (datetime.datetime.now().date() - (sorted(missedDays)[-1])).days, *max(dayMessageCounts.items(), key=operator.itemgetter(1)))) # py3.5 :)
    # print('The last day without messages was %s, %s days ago, and the most eventful day was %s with %s messages.' % ((sorted(missedDays)[-1], (datetime.datetime.now().date() - (sorted(missedDays)[-1])).days) + max(dayMessageCounts.items(), key=operator.itemgetter(1)))) # < py3.5 :(
    if termgraph:
        print('\nConversations:\n')
        # conversation_id -> set of author ids, built via reduce over DISTINCT rows.
        conversationAuthors = functools.reduce(lambda s, x: (s[x[0]].add(x[1]), s)[1], cursor.execute('SELECT DISTINCT conversation_id, author FROM fb_messages').fetchall(), collections.defaultdict(set))
        # Label conversations relative to the logged-in user (or, failing that,
        # the author appearing in the most conversations).
        user = idToUser(session.id if session.id else collections.Counter(itertools.chain.from_iterable(conversationAuthors.values())).most_common(1)[0][0])
        messageNums = sorted(map(lambda x: (', '.join(sorted(map(idToUser, conversationAuthors[x[0]]), key=lambda x: (x != user, x))), x[1]), cursor.execute('SELECT conversation_id, COUNT(*) FROM fb_messages GROUP BY conversation_id').fetchall()), key=operator.itemgetter(1))
        graph = termgraph.TermGraph(labels=list(map(operator.itemgetter(0), messageNums)), data=list(map(operator.itemgetter(1), messageNums)))
        graph.render()
        print('\nMessage authors:\n')
        messageNums = sorted(filter(lambda x: not x[0].startswith('Unknown_'), map(lambda x: (idToUser(x[0]), x[1]), cursor.execute('SELECT author, COUNT(*) FROM fb_messages GROUP BY author').fetchall())), key=operator.itemgetter(1))
        graph = termgraph.TermGraph(labels=list(map(operator.itemgetter(0), messageNums)), data=list(map(operator.itemgetter(1), messageNums)))
        graph.render()
    if plotMessageCount:
        # Zero-fill the silent days so the line touches the axis.
        dayMessageCounts.update(dict(map(lambda x: (x, 0), missedDays)))
        dayMessageCountsList = sorted(dayMessageCounts.items(), key=operator.itemgetter(0))
        plt, = plot.plot(list(map(operator.itemgetter(0), dayMessageCountsList)), list(map(operator.itemgetter(1), dayMessageCountsList)), '.b-', label='All')
        plot.xlabel('Date')
        plot.ylabel('Number of Messages')
        plot.title('Number of Messages over time')
        plot.legend(handles=[plt])
        plot.show()
def updateConversation(cursor, conversationID, group=False, limit=500, save=True):
    """Download messages newer than what is stored for one conversation.

    Fetches with a 10x larger batch size when the conversation has never been
    downloaded. When save is true, new messages are inserted via insertMessages.
    Returns (total messages after the update, number of newly fetched messages).
    """
    cursor.execute('SELECT timestamp FROM fb_messages WHERE conversation_id = ? ORDER BY timestamp', (conversationID, ))
    oldMessageTimestamps = list(map(lambda x: {'timestamp': x[0]}, cursor.fetchall()))
    if oldMessageTimestamps:
        newMessages = getNewMessages(conversationID, oldMessages=oldMessageTimestamps, limit=limit, group=group)
    else:
        # First download for this conversation: fetch in much larger pages.
        newMessages = getNewMessages(conversationID, group=group, limit=limit*10)
    if save:
        if newMessages:
            insertMessages(cursor, newMessages, conversationID)
            print('Inserted %s messages in database.' % len(newMessages))
        else:
            print('No new messages to insert in database.')
    return (len(newMessages) + len(oldMessageTimestamps), len(newMessages))
def updateConversations(cursor, conversationIDs, limit=500, save=True):
    """Update several conversations and return (new messages, total messages).

    Renders a termgraph bar chart of per-conversation totals when termgraph
    imported successfully.
    """
    messageNums = {}
    numAddedMessages = 0
    for conversationID in conversationIDs:
        numMessages, numNewMessages = updateConversation(cursor, conversationID, limit=limit, save=save)
        messageNums[conversationID] = numMessages
        numAddedMessages += numNewMessages
    # BUG FIX: the total is computed from the dict before any rebinding — the
    # original rebound messageNums to a list inside the termgraph branch and
    # crashed on the final sum() whenever termgraph was unavailable.
    totalMessages = sum(messageNums.values())
    if termgraph:
        labelled = list(map(lambda x: (idToUser(x[0]), x[1]), sorted(messageNums.items(), key=operator.itemgetter(1))))
        graph = termgraph.TermGraph(labels=list(map(operator.itemgetter(0), labelled)), data=list(map(operator.itemgetter(1), labelled)))
        graph.render()
    return (numAddedMessages, totalMessages)
def updateAllIndividualConversations(cursor, limit=500, save=True):
    """Refresh every known one-on-one conversation and report the totals."""
    inserted, total = updateConversations(cursor, list(idToUserMap.keys()), limit=limit, save=save)
    print('\nInserted %s new messages in all individual conversations for a total of %s messages.' % (inserted, total))
def searchConversation(cursor, conversationID, query, pageSize=None, latestFirst=False, authorFilter=None, regularExpression=False, caseSensitive=False):
    """Interactively search one conversation and page the results in the terminal.

    query is a substring (or regex when regularExpression=True); authorFilter
    restricts output to one author; latestFirst reverses chronological order.
    pageSize defaults to the current terminal size. When termgraph is available
    a histogram of matches and authors is shown before the results.
    """
    if not pageSize:
        pageSize = shutil.get_terminal_size((80, 20))
    messages = getMessages(cursor, conversationID, query=query, regularExpression=regularExpression, caseSensitive=caseSensitive)
    # Widest author name, used to right-align the name column.
    maxAuthorLength = max(map(len, map(idToUser, map(operator.itemgetter('author'), messages))))
    print('%s message results found for search query "%s".\n' % (len(messages), query))
    if termgraph:
        # The substring query is also usable as a (literal-ish) regex here.
        regex = re.compile(query, flags=(0 if caseSensitive else re.IGNORECASE))
        # Histogram of the distinct matched strings across all result bodies.
        instances, counts = zip(*reversed(sorted(collections.Counter(itertools.chain.from_iterable(map(lambda x: map(lambda x: '"%s"' % x, regex.findall(x['body'])), messages))).items(), key=operator.itemgetter(1))))
        graph = termgraph.TermGraph(labels=instances, data=counts)
        graph.render()
        print('\n')
        # Histogram of who sent the matching messages.
        authors, counts = zip(*collections.Counter(map(lambda x: idToUser(x['author']), messages)).items())
        graph = termgraph.TermGraph(labels=authors, data=counts)
        graph.render()
        input('\nPress enter to display results...')
        os.system('cls' if os.name == 'nt' else 'clear')
    if latestFirst:
        messages = reversed(messages)
    currentLine = 0
    for message in messages:
        author = idToUser(message['author'])
        if not authorFilter or author == idToUser(authorFilter):
            if message['body']:
                body = message['body']
            elif message['has_attachment']:
                # Render attachments as "[type url]" placeholders.
                body = ' '.join(list(map(lambda x: '[%s %s]' % (x['attach_type'], x['url']), message['attachments'])))
            else:
                body = '[???]'
            timestamp = datetime.datetime.fromtimestamp(message['timestamp'] / 1e3).strftime('%m-%d-%Y %H:%M:%S')
            output = '(%s)\t%s: %s' % (timestamp, author.rjust(maxAuthorLength), body)
            # Estimate how many terminal rows this entry occupies (wrapping included).
            outputLines = sum(map(lambda x: math.ceil(len(x) / pageSize.columns), output.split('\n')))
            if currentLine + outputLines > (pageSize.lines - 2):
                # Screen full: wait for the user, then clear and start a new page.
                input('\nPress enter to continue...')
                os.system('cls' if os.name == 'nt' else 'clear')
                currentLine = 0
            print(output)
            currentLine += outputLines
def login(password=None):
    """Log into Facebook via m.facebook.com and return an authenticated Session.

    Prompts interactively for the password when one isn't supplied. Reads the
    username from the module-level `session`. Exits the process when login
    fails (no dtsg token found on the home page afterwards).
    """
    if not password:
        password = getpass.getpass('Password (%s): ' % session.username)
    s = requests.Session()
    s.headers.update({
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    })
    # Fetch the login page once: this both sets the initial cookies and yields
    # the hidden form fields. (The original issued the same GET twice and
    # discarded the first response.)
    loginPage = lxml.html.fromstring(s.get('https://m.facebook.com/login.php').text)
    loginForm = dict(loginPage.forms[0].fields)
    loginForm.update({
        'email': session.username,
        'pass': password
    })
    s.post('https://m.facebook.com/login.php?refsrc=https%3A%2F%2Fm.facebook.com%2Flogin.php&lwv=100&refid=9', data=loginForm)
    # A dtsg token embedded in the home page is the signal that login worked.
    dtsg = re.findall(r'\["DTSGInitialData",\[\],\{"token":"(.*?)"}', s.get('https://www.facebook.com/').text)
    if not dtsg:
        print('Login failed.')
        sys.exit(1)
    else:
        dtsg = dtsg[0]
    print('Login successful (dtsg: %s).\n' % dtsg)
    return Session(
        username=session.username,
        id=s.cookies.get_dict()['c_user'],
        session=s,
        dtsg=dtsg
    )
def browserLogin(username, timeoutMinutes=2):
    """Log into Facebook interactively through a Chrome window.

    Pre-fills the email field, focuses the password field, then waits up to
    `timeoutMinutes` for the user to finish logging in (and to clear any
    non-checkpoint flow). Returns a (dtsg, cookies) tuple.

    The timeout was previously hard-coded to 2 minutes; it is now a parameter
    (default unchanged) so callers can pass e.g. args.browser_timeout.
    """
    with contextlib.closing(webdriver.Chrome()) as driver:
        driver.get('https://www.facebook.com/login.php')
        actionChain = ActionChains(driver)
        actionChain.move_to_element(driver.find_element_by_name('email')).send_keys(username)
        actionChain.move_to_element(driver.find_element_by_name('pass')).click()
        actionChain.perform()
        # Poll until the logged-in page source contains the dtsg token;
        # a 'checkpoint' URL (2FA/review) keeps us waiting.
        dtsg = WebDriverWait(driver, timeoutMinutes * 60).until(
            lambda driver:
                re.findall(r'\["DTSGInitialData",\[\],\{"token":"(.*?)"}', driver.page_source) if driver and 'checkpoint' not in driver.current_url else False
        )[0]
        cookies = dict(map(lambda x: (x['name'], x['value']), driver.get_cookies()))
        return dtsg, cookies
def validConversation(idOrName):
    """argparse type: resolve a conversation ID or recognized user name.

    Returns the conversation ID as a string (with any 'fbid:' prefix
    stripped) or raises ArgumentTypeError for unknown input.
    """
    # Bug fix: the original called idToUser(idToUser) — passing the function
    # itself instead of idOrName — so a known user ID could never be
    # recognized by name here (only the group-ID branch could match).
    if not idToUser(idOrName).startswith('Unknown') or idOrName in map(str, groups):
        return str(idOrName).replace('fbid:', '')
    elif userToId(idOrName):
        return userToId(idOrName)
    else:
        raise argparse.ArgumentTypeError('%s is not a valid Facebook ID or recognized name' % idOrName)
def validNewPath(path):
    """argparse type: validate a location for a brand-new database file.

    Raises ArgumentTypeError when the path already exists or its parent
    directory is not writable; otherwise returns `path` unchanged.
    """
    if os.path.exists(path):
        raise argparse.ArgumentTypeError('%s already contains a file' % path)
    # Bug fix: os.path.dirname() returns '' for a bare filename and
    # os.access('', os.W_OK) is False, so relative names like 'chats.db'
    # were wrongly rejected. Treat '' as the current directory.
    elif os.access(os.path.dirname(path) or os.curdir, os.W_OK):
        return path
    else:
        raise argparse.ArgumentTypeError('%s is not a valid path for new message database' % path)
def validCookies(cookieString):
    """argparse type: parse a 'name=value; name=value' cookie string to a dict.

    Raises ArgumentTypeError when the string cannot be parsed.
    """
    try:
        # split('=', 1) keeps values that themselves contain '=' intact
        # (the original split on every '=' and broke on such values).
        return dict(
            pair.strip().split('=', 1)
            for pair in cookieString.strip(';').split(';')
        )
    except ValueError:
        # Bug fix: the original wrote 'except err:' — an undefined name —
        # so any malformed input raised NameError instead of the intended
        # argparse error.
        raise argparse.ArgumentTypeError('%s is an invalid cookie string' % cookieString)
def main(args):
    """Dispatch to the mode selected on the command line.

    Handles authentication (browser, manual dtsg/cookies, or password),
    decompresses the gzip'd message database to a temp file so sqlite3 can
    open it, runs the chosen mode (update / search / statistics), then
    re-compresses the database back in place unless the mode was read-only
    or --dry-run was given.
    """
    if args.init_db:
        initDB(args.init_db)
        # print() returns None — terse "print then return" idiom used below too.
        return print('Database initialization complete.')
    # Module-level session consumed by the login/download helpers.
    global session
    session = Session(username=args.username, session=None, id=None, dtsg=None)
    # --browser exists on the namespace only when selenium was importable.
    if getattr(args, 'browser', None):
        try:
            args.dtsg, args.cookies = browserLogin(args.username)
            print('Login successful (dtsg: %s, cookies: %s).\n' % (args.dtsg, ';'.join(map(lambda x: '%s=%s' % x, args.cookies.items()))))
        except Exception as e:
            return print('Login failed: %s.' % repr(e))
    if args.dtsg and args.cookies:
        # Manual (or browser-derived) credentials: build a session from them.
        s = requests.Session()
        s.cookies.update(args.cookies)
        session = Session(
            username=args.username,
            session=s,
            id=args.cookies['c_user'],
            dtsg=args.dtsg
        )
    elif args.password:
        session = login(password=args.password)
    print('Initalizing... ', end='', flush=True)
    start_time = datetime.datetime.now()
    # Decompress the database into a named temp file so sqlite3 can open it
    # by path. NOTE(review): reopening a NamedTemporaryFile by name works on
    # POSIX but not Windows — confirm Windows isn't a target.
    db = tempfile.NamedTemporaryFile(mode='w+b')
    shutil.copyfileobj(args.database, db)
    db.flush()
    conn = sqlite3.connect(db.name)
    cursor = conn.cursor()
    end_time = datetime.datetime.now()
    print('complete. ({})\n'.format(end_time - start_time))
    save = not args.dry_run
    if args.interactive:
        raise NotImplementedError('Interactive mode is not yet implemented')
    elif args.update:
        for conversation in args.update:
            if conversation in groups:
                # Group threads get the multiplied download limit.
                updateConversation(cursor, conversation, group=True, limit=(args.limit * args.group_limit_multiplier), save=save)
            else:
                updateConversation(cursor, conversation, limit=args.limit, save=save)
    elif args.update_all or args.update_individuals or args.update_groups:
        # NOTE(review): --update-all takes the same path as
        # --update-individuals, so groups are never updated by it — confirm
        # this is intended.
        if args.update_all or args.update_individuals:
            updateAllIndividualConversations(cursor, limit=args.limit, save=save)
        else:
            for group in groups:
                updateConversation(cursor, group, group=True, limit=(args.limit * args.group_limit_multiplier), save=save)
    elif args.search:
        searchConversation(
            cursor,
            args.search[0],
            args.search[1],
            latestFirst=args.latest_first,
            authorFilter=args.author,
            regularExpression=args.regular_expression,
            caseSensitive=args.case_sensitive
        )
    elif args.statistics is not None:
        # -t with arguments: per-conversation stats; bare -t: overall stats.
        if len(args.statistics):
            for conversation in args.statistics:
                messagesStats(
                    getMessages(cursor, conversation),
                    # Plot/word-cloud flags exist only when their libraries imported.
                    plotMessageCount=getattr(args, 'plot_message_count', None),
                    plotCumulativeMessageCount=getattr(args, 'plot_cumulative_message_count', None),
                    wordClouds=getattr(args, 'word_clouds', None),
                    limitPlotToStreak=getattr(args, 'limit_plot_to_streak', None)
                )
        else:
            allMessagesStats(cursor, plotMessageCount=getattr(args, 'plot_message_count', None))
    conn.commit()
    conn.close()
    # Re-compress only after a mode that may have written, and not on --dry-run.
    if not (args.dry_run or args.search or (args.statistics is not None)):
        print('\nSaving and compressing database... ', end='', flush=True)
        start_time = datetime.datetime.now()
        # Keep a backup of the compressed file in case re-compression fails.
        shutil.copyfile(args.database.name, args.database.name + '.tmp')
        try:
            os.remove(args.database.name)
            with gzip.open(args.database.name, 'wb') as db_compressed:
                with open(db.name, 'rb') as db_uncompressed:
                    db_compressed.writelines(db_uncompressed)
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit;
            # consider narrowing to 'except Exception'.
            print('Unable to resave file, restoring temporary copy.')
            shutil.copyfile(args.database.name + '.tmp', args.database.name)
        finally:
            os.remove(args.database.name + '.tmp')
        end_time = datetime.datetime.now()
        print('complete. ({})'.format(end_time - start_time))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Download, archive, analyze and plot Facebook Messenger conversations (individual and group)',
        # Surface in --help any optional features disabled by failed imports.
        epilog=', '.join(filter(operator.truth, [
            'selenium not installed, browser authentication disabled' if not webdriver else None,
            'matplotlib not installed, message plotting disabled' if not matplotlib else None,
            'wordcloud not installed, word clouds disabled' if not wordcloud else None
        ]))
    )
    parser.add_argument('-b', '--database', help='Path to gzip compressed SQLite message database', default='/Users/Sushain/Dropbox/Miscellaneous/chats.db.gz')
    # Exactly one mode is required per invocation.
    modeNamedGroup = parser.add_argument_group('mode')
    modeGroup = modeNamedGroup.add_mutually_exclusive_group(required=True)
    modeGroup.add_argument('-i', '--interactive', action='store_true', help='Interactive mode')
    modeGroup.add_argument('-k', '--update', type=validConversation, nargs='+', metavar='ID/NAME', help='Update a conversation')
    modeGroup.add_argument('-a', '--update-all', action='store_true', help='Update all conversations')
    modeGroup.add_argument('-n', '--update-individuals', action='store_true', help='Update all individual conversations')
    modeGroup.add_argument('-g', '--update-groups', action='store_true', help='Update all group conversations')
    modeGroup.add_argument('-s', '--search', metavar=('ID/NAME', 'QUERY'), nargs=2, help='Search a conversation')
    modeGroup.add_argument('-t', '--statistics', type=validConversation, nargs='*', metavar='ID/NAME', help='Display conversation(s) statistics (all conversations statistics void of an argument)')
    modeGroup.add_argument('--init-db', metavar='PATH', type=validNewPath, help='Initialize SQLite message database here')
    authGroup = parser.add_argument_group('authentication', 'Conversation authentication options')
    authGroup.add_argument('-u', '--username', default='sushain97', help='Facebook account username')
    authGroup.add_argument('-p', '--password', help='Facebook account password')
    if webdriver:
        # Browser auth options only make sense when selenium imported cleanly.
        authGroup.add_argument('--browser', action='store_true', default=False, help='Facebook browser authentication')
        authGroup.add_argument('--browser-timeout', type=int, default=2, help='Facebook browser authentication timeout in minutes')
    authGroup.add_argument('--dtsg', help='Facebook dtsg value (must use --cookies as well)')
    authGroup.add_argument('--cookies', type=validCookies, help='Facebook cookies value (must use --dtsg as well)')
    statsGroup = parser.add_argument_group('statistics', 'Conversation statistics options')
    if matplotlib:
        statsGroup.add_argument('-P', '--plot-message-count', action='store_true', default=False, help='Plot individual message count over time')
        statsGroup.add_argument('-Q', '--plot-cumulative-message-count', action='store_true', default=False, help='Plot individual cumulative message count over time')
        statsGroup.add_argument('-S', '--limit-plot-to-streak', action='store_true', default=False, help='Limit message plot to time since streak started')
    if wordcloud:
        statsGroup.add_argument('-W', '--word-clouds', action='store_true', default=False, help='Display individual message word clouds')
    searchGroup = parser.add_argument_group('search', 'Conversation search options')
    searchGroup.add_argument('-F', '--latest-first', action='store_true', default=False, help='Show latest messages first')
    searchGroup.add_argument('-I', '--regular-expression', action='store_true', default=False, help='Treat search query as regular expression')
    searchGroup.add_argument('-A', '--author', type=validConversation, metavar='ID/NAME', help='Show only messages from this author')
    searchGroup.add_argument('-C', '--case-sensitive', action='store_true', default=False, help='Case sensitive search')
    downloadGroup = parser.add_argument_group('download', 'Conversation download options')
    downloadGroup.add_argument('-L', '--limit', type=int, default=500, help='Message download limit')
    # NOTE(review): action='count' means each -M increments the multiplier
    # starting from the default of 2 rather than accepting a value — confirm
    # that is intended (a plain type=int would take an explicit multiplier).
    downloadGroup.add_argument('-M', '--group-limit-multiplier', action='count', default=2, help='Multiply message download limit for groups')
    downloadGroup.add_argument('-D', '--dry-run', action='store_true', default=False, help='Don\'t save to database')
    args = parser.parse_args()
    if not args.init_db:
        # Every mode except --init-db reads the gzip-compressed database.
        args.database = gzip.open(args.database)
    if bool(args.dtsg) ^ bool(args.cookies):
        parser.error('--dtsg and --cookies must both be set for manual authentication.')
    if args.search:
        # -s takes two positionals, so its ID can't use type=validConversation
        # directly; validate the first argument here instead.
        args.search[0] = validConversation(args.search[0])
    plotMessagesArgRequired = getattr(args, 'limit_plot_to_streak', False)
    plottingMessages = getattr(args, 'plot_cumulative_message_count', False) or getattr(args, 'plot_message_count', False)
    if plotMessagesArgRequired and not plottingMessages:
        # Bug fix: this message previously misspelled the option name as
        # '--plot_message-count'.
        parser.error('--plot-message-count or --plot-cumulative-message-count must be set when --limit-plot-to-streak is.')
    main(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment