Skip to content

Instantly share code, notes, and snippets.

@aussetg
Forked from sushain97/fb.py
Created January 23, 2017 10:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aussetg/3dbdc615d844b36253805936fea21efe to your computer and use it in GitHub Desktop.
Downloads, archives, analyzes and plots Facebook Messenger conversations (individual and group)
#!/usr/bin/env python3
import collections
import copy
import datetime
import functools
import itertools
import json
import lzma
import gzip
import operator
import os
import pprint
import sys
import time
import re
import sqlite3
import glob
import tempfile
import shutil
import getpass
import argparse
import math
import contextlib
import requests
import lxml.html
import numpy
# Optional dependencies: each feature degrades gracefully when its package is
# missing. print() returns None, so "name = print(...)" both warns the user
# and binds the module name to None as a feature flag.
try:
    import matplotlib
    import matplotlib.pyplot as plot
except ImportError:
    matplotlib = plot = print('Failed to import matplotlib, plotting will not be available.')
try:
    import wordcloud
except ImportError:
    wordcloud = print('Failed to import wordcloud, word clouds will not be available.')
try:
    import termgraph
except ImportError:
    termgraph = print('Failed to import termgraph, terminal graphs will not be displayed.')
try:
    import selenium
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.action_chains import ActionChains
except ImportError:
    # Was a bare `except:`, which would also swallow KeyboardInterrupt,
    # SystemExit, and unrelated errors raised during selenium's import.
    webdriver = print('Failed to import selenium, browser authentication will not be available.')
# Map of Facebook numeric user ids to display names for one-on-one
# conversations. Ids absent from this map are rendered as Unknown_XXXX.
idToUserMap = {
'100004322110944': 'Rowena',
'1338262658': 'Sushain',
'100004252695708': 'Michelle N.',
'100009196845865': 'Brittany',
'100003355055997': 'Karen',
'100000534453859': 'Ashley',
'100004476751719': 'Benji',
'1626215140': 'Maxine',
'777993547': 'Jean',
'1311112684': 'Keerthana',
'1412264090': 'Michelle H.',
'100002628181062': 'Pallavi',
'100001184052364': 'Shreya',
'100000284933876': 'Ani',
'100002398916527': 'Amisha',
'100006790448156': 'Serena',
'100002878482600': 'Tiffany',
'100002576434633': 'Snigdha',
'1333603699': 'Saloni',
'1841753743': 'Christina',
'100003127069904': 'Tiffany Do',
'1253817276': 'Alexander',
'100000986269083': 'Prachi',
'100000241495175': 'Eric'
}
# Thread ids of group conversations (the mercury API addresses these with
# 'thread_fbids' rather than 'user_ids').
groups = [494248544089735, 1513200892278424, 322604171221575, 1021123947914529,
879550675408978, 940010492714431, 1700273163527834, 1097674336985252]
# Regexes used by the profanity statistics in messagesStats().
profanity = [r'\bfuck+(?:ing|ed|er)?\b', r'\b(?:dip)?shit+(?:ty+)?\b', r'\bdamn(?:it+)?\b',
r'\bgoddamn\b', r'\bdick\b', r'\bbullshit+\b', r'\bbastard\b', r'\bhell+\b',
r'\bbitch(?:ass)?\b', r'\bass+\b', r'\ba(?:ss)?hole\b', r'\bmotherfucker\b']
# Logged-in credentials: email/username, numeric user id, an authenticated
# requests.Session, and the fb_dtsg CSRF token.
Session = collections.namedtuple('Session', ['username', 'id', 'session', 'dtsg'])
# Module-wide session, populated by login()/main() before any fetch.
session = Session(None, None, None, None)
def idToUser(userid):
    """Map a Facebook user id (optionally 'fbid:'-prefixed) to a display name.

    Ids missing from idToUserMap get a placeholder like 'Unknown_1a2b'. The
    original derived the suffix from builtin hash(), which is randomized per
    process for strings, so placeholders changed between runs; an md5 digest
    keeps them stable across sessions.
    """
    import hashlib  # local import so the module's import block stays untouched
    userid = str(userid).replace('fbid:', '')
    if userid in idToUserMap:
        return idToUserMap[userid]
    return 'Unknown_' + hashlib.md5(userid.encode('utf-8')).hexdigest()[:4]
def userToId(user):
    """Reverse lookup: case-insensitive display name -> Facebook id, or None."""
    nameToId = {name.lower(): uid for uid, name in idToUserMap.items()}
    return nameToId.get(user.lower())
# Message fields by storage type, in fb_messages column order: 'text' columns
# hold raw strings, 'json' columns hold json.dumps()-serialized values
# (suffixed _json in the schema), 'boolean' columns hold flags. Used by both
# initDB() (schema) and insertMessages() (row layout) — keep them in sync.
fields = {
'text': ['thread_id', 'threading_id', 'author_email', 'message_id', 'folder', 'source', 'html_body', 'author', 'subject', 'body', 'action_id', 'action_type', 'other_user_fbid', 'forward_count', 'offline_threading_id', 'log_message_type', 'log_message_body', 'location_text'],
'json': ['forward_message_ids', 'raw_attachments', 'attachments', 'ephemeral_ttl_mode', 'ranges', 'source_tags', 'tags', 'log_message_data', 'coordinates'],
'boolean': ['has_attachment', 'is_spoof_warning', 'is_filtered_content', 'is_filtered_content_invalid_app', 'is_filtered_content_account', 'is_forward', 'is_unread', 'is_filtered_content_bh', 'is_filtered_content_quasar', 'is_sponsored']
}
def insertMessages(cursor, messages, conversationID):
    """Insert raw message dicts (from Facebook's mercury endpoint or the
    archived JSON dumps) into the fb_messages table.

    Messages whose thread_fbid does not match conversationID are skipped with
    a warning; they occasionally appear in thread_info responses.
    """
    conversationID = int(conversationID)
    # id (AUTOINCREMENT), account, conversation_id + declared columns + timestamp
    numFields = 4 + len(fields['text']) + len(fields['json']) + len(fields['boolean'])

    def formatMessageJSON(conversationID, message):
        # Flatten one message dict into a row tuple for executemany, or
        # return None to skip it. Consumes (mutates) the message dict.
        if int(message['thread_fbid']) != conversationID:
            # TODO: Figure out why this is happening and how to properly filter
            print(conversationID, 'ignoring message', message['thread_fbid'])
            return None
        del message['thread_fbid']
        messageFields = [None, session.username, conversationID]  # None -> AUTOINCREMENT id
        for text_field in fields['text']:
            messageFields.append(str(message.pop(text_field, '')))
        for json_field in fields['json']:
            messageFields.append(json.dumps(message.pop(json_field, None)))
        for boolean_field in fields['boolean']:
            messageFields.append(bool(message.pop(boolean_field, False)))
        messageFields.append(int(message.pop('timestamp')))
        # Redundant derived timestamps: must be present, never stored.
        for field in ['timestamp_relative', 'timestamp_datetime', 'timestamp_absolute', 'timestamp_time_passed']:
            del message[field]
        # Known-but-ignored fields: optional, never stored.
        for field in ['platform_xmd', 'meta_ranges', 'commerce_message_type', 'customizations', 'message_source', 'montage_reply_data', 'skip_bump_thread']:
            message.pop(field, None)
        if message:
            # Leftover keys mean Facebook changed the message schema; dump
            # them and abort so the schema drift is noticed immediately.
            print(message)
        assert not message
        return messageFields

    formattedMessages = [row for row in (formatMessageJSON(conversationID, m) for m in messages)
                         if row is not None]
    cursor.executemany('INSERT INTO fb_messages VALUES (%s)' % ', '.join(['?'] * numFields), formattedMessages)
def initDB(dbPath, fbChatsPath=None):
    """Create the fb_messages table in a fresh sqlite database at dbPath,
    optionally importing archived conversations, then gzip the database file
    and delete the uncompressed copy.

    fbChatsPath, when given, is a pattern with one %s placeholder (e.g.
    'chats/%s.json.xz'); each matching file is named <conversation_id>.json.xz
    and contains an xz-compressed JSON list of messages.
    """
    # contextlib.closing guarantees the connection is released even if an
    # import file is malformed (the original leaked it on exceptions).
    with contextlib.closing(sqlite3.connect(dbPath)) as conn:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE fb_messages (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            account TEXT,
            conversation_id UNSIGNED BIG INT,
            %s,
            %s,
            %s,
            timestamp DATETIME
        )''' % (
            ', '.join(map(lambda x: '%s TEXT' % x, fields['text'])),
            ', '.join(map(lambda x: '%s_json TEXT' % x, fields['json'])),
            ', '.join(map(lambda x: '%s BOOLEAN' % x, fields['boolean']))
        ))
        if fbChatsPath:
            for fname in glob.glob(fbChatsPath % '*'):
                conversationID = int(os.path.basename(fname).replace('.json.xz', ''))
                with lzma.open(fname) as f:
                    messages = json.loads(f.read().decode('utf-8'))
                insertMessages(cursor, messages, conversationID)
        conn.commit()
    # Store the database gzip-compressed; the rest of the tool decompresses
    # it into a temp file before use (see main()).
    with gzip.open(dbPath.replace('.db', '.db.gz'), 'wb') as db_compressed:
        with open(dbPath, 'rb') as db_uncompressed:
            db_compressed.writelines(db_uncompressed)
    os.remove(dbPath)
def getNewMessages(conversationID, group=False, oldMessages=None, limit=2000):
    """Fetch messages for one conversation from Facebook's mercury endpoint,
    paging backwards in time until exhausted or until reaching the newest
    message already present in oldMessages.

    oldMessages, when given, must be ordered by timestamp ascending (only the
    last entry's timestamp is read). Returns the newly fetched messages,
    filtered to those strictly newer than oldMessages' tail. Timestamps are
    milliseconds since the epoch throughout.
    """
    global session
    if not (session.session and session.dtsg and session.id):
        session = login()
    s, dtsg = session.session, session.dtsg
    print(('\nFetching messages from conversation %s' % conversationID) + (' (%s).' % idToUser(conversationID) if not group else '.'))
    newestMessageDate = None
    if oldMessages:
        newestMessageDate = datetime.datetime.fromtimestamp(oldMessages[-1]['timestamp'] / 1e3)
        print('%s messages currently downloaded.' % len(oldMessages))
    startTime = datetime.datetime.now()
    newMessages = []
    offset = 0
    messagesType = 'user_ids' if not group else 'thread_fbids'
    newestTimestamp = ''
    failures = 0
    while True:
        try:
            data = {
                'messages[%s][%s][offset]' % (messagesType, conversationID): offset,
                'messages[%s][%s][limit]' % (messagesType, conversationID): limit,
                'messages[%s][%s][timestamp]' % (messagesType, conversationID): newestTimestamp,
                'client': 'mercury',
                '__user': session.id,
                '__a': '1',
                '__dyn': '7AmanEzUFlym5Q9UoHbgWy1m9JaUK5EK8GAFp8yupFLO0xBxvyui9zob4q8zUK5Uc-dy88awF-qp7yoSy28Oi9x2rmEWfSiVWxeUlG4oCi4aDixa26inzpoS6rCgKmLF5Dxm68',
                '__req': '6',
                'fb_dtsg': dtsg,
                'ttstamp': '265817110078561197411111711395',
                '__rev': '2046448'
            }
            t = s.post('https://www.facebook.com/ajax/mercury/thread_info.php', data=data).text
            # Response is JSON wrapped in a "for (;;);" prefix; slice it out.
            t = json.loads(t[t.index('{'):t.rindex('}')+1])['payload']['actions']
            if newestTimestamp:
                t = t[:-1]  # drop the boundary message duplicated across pages
            newestTimestamp = t[0]['timestamp']
        except TypeError:
            failures += 1
            print('Failed to fetch messages at offset %s with limit %s (failure #%s).' % (offset, limit, failures))
            if failures > 2:
                print('Changing limit from %s to %s.' % (limit, limit // 2))
                # Integer division: `limit /= 2` made limit (and thus offset)
                # floats in Python 3, which were then sent to the API.
                limit //= 2
                failures = 0
                if limit < 10:
                    print('Giving up after fetching %s messages.' % len(newMessages))
                    break  # was `continue`, which retried forever instead of giving up
            continue  # retry the same offset
        failures = 0
        newMessages = t + newMessages
        offset += limit
        oldestMessageDate = datetime.datetime.fromtimestamp(newMessages[0]['timestamp'] / 1e3)
        # Stop once a short page arrives (history exhausted) or we've paged
        # back past the newest message we already have.
        if len(t) < limit or (oldMessages and oldestMessageDate < newestMessageDate):
            print('Completed fetching %s messages in conversation %s.' % (len(newMessages), conversationID))
            break
        else:
            print('Fetched %s messages, offset at %s, fetched %s messages so far.' % (limit, offset, len(newMessages)))
    if oldMessages:
        newMessages = list(filter(lambda x: datetime.datetime.fromtimestamp(x['timestamp'] / 1e3) > newestMessageDate, newMessages))
        print('Added %s messages to existing %s messages for a total of %s.' % (len(newMessages), len(oldMessages), len(newMessages) + len(oldMessages)))
    endTime = datetime.datetime.now()
    print('The data retrieval took {} seconds.'.format(endTime - startTime))
    return newMessages
def getMessages(cursor, conversationID, query=None, regularExpression=False, caseSensitive=False):
    """Load one conversation's messages from the database, ordered by timestamp.

    query: optional substring (or regex when regularExpression=True) the body
    must contain; caseSensitive toggles matching sensitivity. Returns dicts
    with author/timestamp/body/has_attachment/attachments (JSON-decoded).

    Note: SQLite's LIKE is case-insensitive for ASCII, so the original
    caseSensitive=True branch did not actually match case-sensitively; the
    byte-wise instr() test is used instead.
    """
    base = '''
        SELECT author, timestamp, body, has_attachment, attachments_json
        FROM fb_messages
        WHERE conversation_id = ?%s
        ORDER BY timestamp'''
    if query and not regularExpression:
        if caseSensitive:
            messages = cursor.execute(base % ' AND instr(body, ?) > 0',
                                      (conversationID, query)).fetchall()
        else:
            messages = cursor.execute(base % ' AND LOWER(body) LIKE ?',
                                      (conversationID, '%%%s%%' % query.lower())).fetchall()
    else:
        messages = cursor.execute(base % '', (conversationID, )).fetchall()
        if regularExpression and query:
            # Regex filtering happens in Python; SQLite has no REGEXP by default.
            regex = re.compile(query, flags=(0 if caseSensitive else re.IGNORECASE))
            messages = [row for row in messages if regex.search(row[2])]
    return [{
        'author': row[0],
        'timestamp': row[1],
        'body': row[2],
        'has_attachment': bool(row[3]),
        'attachments': json.loads(row[4])
    } for row in messages]
def allDaysSpan(oldest, newest):
    """Return every calendar date from oldest.date() through newest.date(),
    inclusive; empty if newest precedes oldest."""
    first = oldest.date()
    span = (newest.date() - first).days
    return [first + datetime.timedelta(days=offset) for offset in range(span + 1)]
#@profile
def messagesStats(messages, plotMessageCount=False, plotCumulativeMessageCount=False, wordClouds=False, limitPlotToStreak=False):
    """Print a battery of statistics about one conversation and optionally
    plot message counts over time and per-user word clouds.

    messages: dicts with author/timestamp/body/has_attachment/attachments as
    produced by getMessages(), assumed ordered by timestamp ascending (the
    streak and response-time logic relies on that order).
    """
    startTime = datetime.datetime.now()
    oldest, newest = datetime.datetime.max, datetime.datetime.min
    # Per-user counts of message kinds: 'all', 'text', 'sticker', 'other'.
    messageCounts = collections.defaultdict(lambda: collections.Counter({'sticker': 0, 'text': 0, 'other': 0}))
    messageContents = collections.defaultdict(list)   # user -> list of text bodies
    daysSpoken = set()                                # dates with >= 1 message
    daysMessages = collections.defaultdict(lambda: collections.defaultdict(int))  # date -> user -> count
    stickerCounts = collections.defaultdict(lambda: collections.Counter())        # user -> sticker url -> count
    responseTimes = collections.defaultdict(list)     # user -> seconds to reply
    messageStreaks = []                               # (user, consecutive message count)
    users = set()
    lastMessageUser = None
    currentMessageStreak = 0
    lastTimestamp = None
    for message in messages:
        # Timestamps are milliseconds since the epoch.
        date = datetime.datetime.fromtimestamp(message['timestamp'] / 1e3)
        oldest = min(oldest, date)
        newest = max(newest, date)
        user = idToUser(message['author'])
        daysMessages[date.date()][user] += 1
        daysSpoken.add(date.date())
        users.add(user)
        if lastMessageUser == user:
            # Same sender as the previous message: extend their streak.
            currentMessageStreak += 1
            lastTimestamp = date
        else:
            if lastMessageUser:
                messageStreaks.append((lastMessageUser, currentMessageStreak))
            lastMessageUser = user
            currentMessageStreak = 1
            if lastTimestamp:
                # Gap between the previous sender's last message and this reply.
                responseTimes[user].append((date - lastTimestamp).total_seconds())
            lastTimestamp = date
        if 'body' in message:
            text = message['body']
            messageCounts[user]['all'] += 1
            if text and len(text):
                messageCounts[user]['text'] += 1
                messageContents[user].append(text)
            else:
                # Empty body: classify attachment-only messages.
                if message['has_attachment'] and 'attach_type' in message['attachments'][0] and message['attachments'][0]['attach_type'] == 'sticker':
                    messageCounts[user]['sticker'] += 1
                    stickerCounts[user][message['attachments'][0]['url']] += 1
                else:
                    messageCounts[user]['other'] += 1
        else:
            pass # print(message['log_message_body'])
    print('Conversations amongst %s between %s and %s:\n' % (' & '.join(users), oldest, newest))
    # One newline-joined blob of each user's text messages, for regex/wordcloud use.
    # NOTE(review): a user with zero text messages has no entry here, so the
    # profanity loop below would KeyError for them — confirm against real data.
    messageContent = dict(map(lambda x: (x[0], '\n'.join(x[1])), messageContents.items()))
    totalCounts = collections.Counter({'sticker': 0, 'text': 0})
    for person, counts in messageCounts.items():
        totalCounts['sticker'] += counts['sticker']
        totalCounts['text'] += counts['text']
        totalCounts['all'] += counts['all']
        # NOTE(review): divides by counts['text'] — a user with 0 text
        # messages would raise ZeroDivisionError here.
        print('%s sent %s total messages, %s text messages (%.2f%%) and %s stickers (%.2f%%). On average, the text messages were %.2f characters long which makes for a total of %s characters.' % (person, counts['all'], counts['text'], float(counts['text']) / counts['all'] * 100, counts['sticker'], float(counts['sticker']) / counts['all'] * 100, float(len(messageContent[person])) / counts['text'], len(messageContent[person])))
    # Per-day top sender, then how many days each user "won".
    topMessages = dict(map(lambda x: (x[0], sorted(x[1].items(), key=operator.itemgetter(1), reverse=True)[0][0]), daysMessages.items()))
    topMessagesCounts = sorted(list(collections.Counter(topMessages.values()).items()), key=operator.itemgetter(1), reverse=True)
    if len(topMessagesCounts) == 1:
        print('%s talked the most every day...' % topMessagesCounts[0][0])
    else:
        print('%s talks the most, with %s day(s) when they sent the most messages, and %s is the quiet one with %s day(s).' % (topMessagesCounts[0][0], topMessagesCounts[0][1], topMessagesCounts[1][0], topMessagesCounts[1][1]))
    print('\nSo, a total of %s messages, %s text messages (%.2f%%) and %s stickers (%.2f%%).' % (totalCounts['all'], totalCounts['text'], float(totalCounts['text'])/totalCounts['all'] * 100, totalCounts['sticker'], float(totalCounts['sticker']) / totalCounts['all'] * 100))
    allDays = allDaysSpan(oldest, newest)
    print('That makes for an average of %.2f messages per day!' % (float(totalCounts['all']) / len(allDays)))
    print('Over the span of %s day(s), %s day(s) went without conversation (%.2f%%).' % (len(allDays), len(set(allDays) - daysSpoken), float(len(set(allDays) - daysSpoken)) / len(allDays) * 100))
    print('So, if we take that into account, it makes for an average of %.2f messages on days with conversation!' % (float(totalCounts['all']) / len(daysSpoken)))
    # For each user, record only their single most-used variant of each
    # profane word pattern (see the module-level `profanity` regex list).
    profanityCounts = collections.defaultdict(dict)
    for user in users:
        for word in profanity:
            matches = re.findall(word, messageContent[user], flags=re.IGNORECASE)
            if matches:
                mostCommon = collections.Counter(map(str.lower, matches)).most_common(1)[0]
                profanityCounts[user][mostCommon[0]] = mostCommon[1]
    profanityTotalCounts = list(reversed(sorted(list(map(lambda x: (x[0], sum(x[1].values())), profanityCounts.items())), key=operator.itemgetter(1))))
    print('\n%s has the potty mouth with %s profane word(s) said whereas %s.' % (profanityTotalCounts[0][0], profanityTotalCounts[0][1], ', '.join(map(lambda x: '%s has said %s profane word(s)' % x, profanityTotalCounts[1:]))))
    for user in sorted(users, key=lambda x: - dict(profanityTotalCounts).get(x, 0)):
        userProfanityCounts = list(reversed(sorted(profanityCounts[user].items(), key=operator.itemgetter(1))))
        if userProfanityCounts:
            print('%s\'s profanity of choice seems to be "%s" (%s occurences), they\'re also fans of %s.' % (user, userProfanityCounts[0][0], userProfanityCounts[0][1], ', '.join(map(lambda x: '"%s" (%s)' % x, userProfanityCounts[1:])) or 'apparently not much else'))
        else:
            print('%s hasn\'t been the slightest bit profane.' % user)
    print('\nJust in case you\'re curious, the most eventful day was %s, when %s messages were sent :D' % tuple(max(map(lambda x: (x[0], functools.reduce(lambda s, a: s + a[1], x[1].items(), 0)), daysMessages.items()), key=operator.itemgetter(1))))
    # Longest run of consecutive days with at least one message.
    longestseq, currentseq = [], []
    for day in sorted(list(daysSpoken)):
        if len(currentseq) > len(longestseq):
            longestseq = copy.copy(currentseq)
        if currentseq and currentseq[-1] + datetime.timedelta(days=1) == day:
            currentseq.append(day)
        else:
            currentseq = [day]
    if len(currentseq) > len(longestseq):
        longestseq = copy.copy(currentseq)
    print('The longest streak of days with at least one message lasted %s days, from %s to %s!' % (len(longestseq), longestseq[0], longestseq[-1]))
    # A streak is "current" if its last day is today or yesterday.
    if currentseq and datetime.datetime.now().date() - currentseq[-1] <= datetime.timedelta(days=1):
        print('On the other hand, the current streak is %s days, from %s to %s.' % (len(currentseq), currentseq[0], currentseq[-1]))
    else:
        print('On the other hand, the current streak is 0 days, you haven\'t conversed since %s :(' % currentseq[-1])
    print('\nNow, on to stickers. There were an average of %.2f stickers used on days with conversation!' % (float(totalCounts['sticker']) / len(daysSpoken)))
    for user in users:
        print('Out of %s\'s %s stickers, the five most used were: ' % (user, messageCounts[user]['sticker']) + ', '.join(list(map(lambda x: '%s (%s)' % x, stickerCounts[user].most_common(5)))))
    # Per-user histogram of consecutive-message run lengths.
    messageStreaksPerUser = {}
    for user in users:
        messageStreaksPerUser[user] = collections.Counter(map(operator.itemgetter(1), filter(lambda x: x[0] == user, messageStreaks)))
    if len(users) == 2 and len(messageStreaks) > 1:
        print('\nSince there are only two people in this conversation, we can do some more calculations!')
        user1 = messageStreaks[0][0]
        user2 = messageStreaks[1][0]
        # Average ratio of reply-run length to the preceding run, each direction.
        sum1, num1, sum2, num2 = 0, 0, 0, 0
        lastMessageStreak = (None, 0)
        for messageStreak in messageStreaks:
            if lastMessageStreak[0] == user1 and messageStreak[0] == user2:
                sum1 += messageStreak[1] / lastMessageStreak[1]
                num1 += 1
            elif lastMessageStreak[0] == user2 and messageStreak[0] == user1:
                sum2 += messageStreak[1] / lastMessageStreak[1]
                num2 += 1
            lastMessageStreak = messageStreak
        print('%s sends %.2f consecutive message on average and for each message, %s responds with %.2f messages on average.' % (user1, numpy.average(list(messageStreaksPerUser[user1].keys()), weights=list(messageStreaksPerUser[user1].values())), user2, sum1 / num1))
        print('On the other hand, %s sends %.2f consecutive message on average and for each message, %s responds with %.2f messages on average.' % (user2, numpy.average(list(messageStreaksPerUser[user2].keys()), weights=list(messageStreaksPerUser[user2].values())), user1, sum2 / num2))
        print('When %s sends a message, %s tends to respond in %.1f seconds (median response time).' % (user1, user2, numpy.median(responseTimes[user2])))
        print('On the other hand, when %s sends a message, %s tends to respond in %.1f seconds (median response time).' % (user2, user1, numpy.median(responseTimes[user1])))
    endTime = datetime.datetime.now()
    print('\nThe data compilation took {} seconds.'.format(endTime - startTime))
    colors = ['b', 'r', 'g', 'c']  # matplotlib line colors, cycled per user
    if plotMessageCount or plotCumulativeMessageCount:
        daysMessagesList = sorted(daysMessages.items(), key=operator.itemgetter(0))
        fig = plot.figure()
        subplotCount = len(list(filter(operator.truth, [plotMessageCount, plotCumulativeMessageCount])))
        if plotMessageCount:
            ax1 = fig.add_subplot(subplotCount, 1, 1)
            plot.xlabel('Date')
            plot.ylabel('Quantity')
            plot.title('Number of Messages')
            plots1 = []
        if plotCumulativeMessageCount:
            ax2 = fig.add_subplot(subplotCount, 1, 2 if plotMessageCount else 1)
            plot.xlabel('Date')
            plot.ylabel('Quantity')
            plot.title('Number of Messages over Time')
            plots2 = []
        for i, user in enumerate(users):
            userMessages = list(map(lambda x: (x[0], x[1][user]), filter(lambda y: user in y[1], daysMessagesList)))
            userDays = list(map(operator.itemgetter(0), userMessages))
            # Fill in zero-message days so the lines are continuous.
            for day in filter(lambda x: x not in userDays, allDays):
                userMessages.append((day, 0))
            userMessages = sorted(userMessages, key=operator.itemgetter(0))
            if limitPlotToStreak:
                userMessages = list(filter(lambda x: x[0] >= longestseq[0] and x[0] <= longestseq[-1], userMessages))
            if plotMessageCount:
                plt, = ax1.plot(*zip(*userMessages), '.%s-' % colors[i % len(colors)], label=user)
                plots1.append(plt)
            if plotCumulativeMessageCount:
                cumulativeUserMessages = list(itertools.accumulate(userMessages, func=lambda x, y: (y[0], x[1] + y[1])))
                plt, = ax2.plot(*zip(*cumulativeUserMessages), '.%s-' % colors[i % len(colors)], label=user+' (cumulative)')
                plots2.append(plt)
        if plotMessageCount:
            ax1.legend(handles=plots1)
        if plotCumulativeMessageCount:
            ax2.legend(handles=plots2, loc='lower right')
        plot.show()
    if wordClouds:
        # Ad-hoc stopword lists accumulated over time; kept verbatim.
        wordcloud.STOPWORDS.update(["T", "t", "P", ":P", "im", "p", 'http', 'https', 'd', 'o'])
        wordcloud.STOPWORDS.update(["u", "ur", "i"])
        wordcloud.STOPWORDS.update(["T", "t", "P", ":P", "lol", "LOL", "yeah", "okay", "oh", "im", "p", 'http', 'https', 'd', 'o', 'want', 'go', 'png', 'skc'])
        wordcloud.STOPWORDS.update(['dont', 'hes', 'whens', 'weve', 'hed', 'theres', 'havent', 'theyll', 'whos', 'theyd', 'youve', 'well', 'theyve', 'wont', 'mustnt', 'isnt', 'ill', 'whys', 'youd', 'wasnt', 'shouldnt', 'youre', 'arent', 'id', 'werent', 'im', 'cant', 'hadnt', 'couldnt', 'doesnt', 'hows', 'its', 'wheres', 'ive', 'didnt', 'whats', 'heres', 'theyre', 'hasnt', 'wouldnt', 'wed', 'shant', 'lets', 'hell', 'shed', 'youll', 'were', 'shes', 'thats'])
        wordcloud.STOPWORDS.update(['think', 'make', 'one', 'wait', 'people'])
        fig = plot.figure()
        if len(users) > 5:
            # Too many users for per-user clouds: one combined cloud, plus a
            # personal one when we know who is logged in.
            allContent = '\n'.join(messageContent.values())
            wc = wordcloud.WordCloud(background_color='white', max_words=500, stopwords=wordcloud.STOPWORDS, font_path='/Library/Fonts/Futura.ttc', width=1000, height=1500)
            wc.generate(allContent)
            f = fig.add_subplot(1, 2, 1)
            f.axes.get_xaxis().set_visible(False)
            f.axes.get_yaxis().set_visible(False)
            f.set_title('Everyone')
            plot.imshow(wc)
            if session.id:
                wc = wordcloud.WordCloud(background_color='white', max_words=500, stopwords=wordcloud.STOPWORDS, font_path='/Library/Fonts/Futura.ttc', width=1000, height=1500)
                wc.generate(messageContent[idToUser(session.id)])
                f = fig.add_subplot(1, 2, 2)
                f.axes.get_xaxis().set_visible(False)
                f.axes.get_yaxis().set_visible(False)
                f.set_title('Me')
                plot.imshow(wc)
        else:
            for i, user in enumerate(users):
                wc = wordcloud.WordCloud(background_color='white', max_words=500, stopwords=wordcloud.STOPWORDS, font_path='/Library/Fonts/Futura.ttc', width=1000, height=1500)
                wc.generate(messageContent[user])
                f = fig.add_subplot(1, len(users), i + 1)
                f.axes.get_xaxis().set_visible(False)
                f.axes.get_yaxis().set_visible(False)
                f.set_title(user)
                plot.imshow(wc)
        plot.axis('off')
        plot.show()
def allMessagesStats(cursor, plotMessageCount=False):
    """Print aggregate statistics across every conversation in the database,
    render per-conversation and per-author bar graphs when termgraph is
    available, and optionally plot the daily message count with matplotlib."""
    messages = list(map(lambda x: {'id': x[0], 'timestamp': x[1], 'author': x[2]}, cursor.execute('SELECT conversation_id, timestamp, author FROM fb_messages').fetchall()))
    oldest, newest = datetime.datetime.max, datetime.datetime.min
    daysMessageUserCounts = collections.defaultdict(lambda: collections.defaultdict(int))  # date -> user -> count
    for message in messages:
        # Timestamps are milliseconds since the epoch.
        date = datetime.datetime.fromtimestamp(message['timestamp'] / 1e3)
        oldest = min(oldest, date)
        newest = max(newest, date)
        user = idToUser(message['author'])
        daysMessageUserCounts[date.date()][user] += 1
    dayMessageCounts = dict(map(lambda x: (x[0], sum(x[1].values())), daysMessageUserCounts.items()))
    missedDays = set(allDaysSpan(oldest, newest)) - set(dayMessageCounts.keys())
    daySpanLength = (newest - oldest).days
    print('You have a total of %s messages spanning %s through %s (%s days)!' % (len(messages), oldest, newest, daySpanLength))
    print('That makes an average of %.02f messages per day unless you account for the %s days without conversation, which makes it %.02f per day.' % (len(messages) / daySpanLength, len(missedDays), len(messages) / (daySpanLength - len(missedDays))))
    print('The last day without messages was %s, %s days ago, and the most eventful day was %s with %s messages.' % (sorted(missedDays)[-1], (datetime.datetime.now().date() - (sorted(missedDays)[-1])).days, *max(dayMessageCounts.items(), key=operator.itemgetter(1)))) # py3.5 :)
    # print('The last day without messages was %s, %s days ago, and the most eventful day was %s with %s messages.' % ((sorted(missedDays)[-1], (datetime.datetime.now().date() - (sorted(missedDays)[-1])).days) + max(dayMessageCounts.items(), key=operator.itemgetter(1)))) # < py3.5 :(
    if termgraph:
        print('\nConversations:\n')
        # conversation_id -> set of author ids seen in it.
        conversationAuthors = functools.reduce(lambda s, x: (s[x[0]].add(x[1]), s)[1], cursor.execute('SELECT DISTINCT conversation_id, author FROM fb_messages').fetchall(), collections.defaultdict(set))
        # The account owner: the logged-in id, else the most ubiquitous author.
        user = idToUser(session.id if session.id else collections.Counter(itertools.chain.from_iterable(conversationAuthors.values())).most_common(1)[0][0])
        # Label each conversation by its participants (owner listed first).
        messageNums = sorted(map(lambda x: (', '.join(sorted(map(idToUser, conversationAuthors[x[0]]), key=lambda x: (x != user, x))), x[1]), cursor.execute('SELECT conversation_id, COUNT(*) FROM fb_messages GROUP BY conversation_id').fetchall()), key=operator.itemgetter(1))
        graph = termgraph.TermGraph(labels=list(map(operator.itemgetter(0), messageNums)), data=list(map(operator.itemgetter(1), messageNums)))
        graph.render()
        print('\nMessage authors:\n')
        messageNums = sorted(filter(lambda x: not x[0].startswith('Unknown_'), map(lambda x: (idToUser(x[0]), x[1]), cursor.execute('SELECT author, COUNT(*) FROM fb_messages GROUP BY author').fetchall())), key=operator.itemgetter(1))
        graph = termgraph.TermGraph(labels=list(map(operator.itemgetter(0), messageNums)), data=list(map(operator.itemgetter(1), messageNums)))
        graph.render()
    if plotMessageCount:
        # Fill in zero-message days so the plotted line is continuous.
        dayMessageCounts.update(dict(map(lambda x: (x, 0), missedDays)))
        dayMessageCountsList = sorted(dayMessageCounts.items(), key=operator.itemgetter(0))
        plt, = plot.plot(list(map(operator.itemgetter(0), dayMessageCountsList)), list(map(operator.itemgetter(1), dayMessageCountsList)), '.b-', label='All')
        plot.xlabel('Date')
        plot.ylabel('Number of Messages')
        plot.title('Number of Messages over time')
        plot.legend(handles=[plt])
        plot.show()
def updateConversation(cursor, conversationID, group=False, limit=500, save=True):
    """Fetch new messages for one conversation and (when save) store them.

    When the database has no messages yet for the conversation, the initial
    backfill uses a 10x larger page size. Returns (total message count after
    the update, number of newly fetched messages).
    """
    cursor.execute('SELECT timestamp FROM fb_messages WHERE conversation_id = ? ORDER BY timestamp', (conversationID, ))
    oldMessageTimestamps = [{'timestamp': row[0]} for row in cursor.fetchall()]
    if oldMessageTimestamps:
        newMessages = getNewMessages(conversationID, oldMessages=oldMessageTimestamps, limit=limit, group=group)
    else:
        newMessages = getNewMessages(conversationID, group=group, limit=limit * 10)
    if save:
        if newMessages:
            insertMessages(cursor, newMessages, conversationID)
            print('Inserted %s messages in database.' % len(newMessages))
        else:
            print('No new messages to insert in database.')
    return (len(newMessages) + len(oldMessageTimestamps), len(newMessages))
def updateConversations(cursor, conversationIDs, limit=500, save=True):
    """Update each (individual) conversation in conversationIDs.

    Returns (number of newly added messages, total messages across the
    conversations). Renders a termgraph bar chart when available.
    """
    messageNums = {}
    numAddedMessages = 0
    for conversationID in conversationIDs:
        numMessages, numNewMessages = updateConversation(cursor, conversationID, limit=limit, save=save)
        messageNums[conversationID] = numMessages
        numAddedMessages += numNewMessages
    # Compute the total from the dict directly: the original summed
    # itemgetter(1) over `messageNums`, which iterated the dict's *keys*
    # (ints) and raised TypeError whenever termgraph was unavailable.
    totalMessages = sum(messageNums.values())
    if termgraph:
        labeled = list(map(lambda x: (idToUser(x[0]), x[1]), sorted(messageNums.items(), key=operator.itemgetter(1))))
        graph = termgraph.TermGraph(labels=list(map(operator.itemgetter(0), labeled)), data=list(map(operator.itemgetter(1), labeled)))
        graph.render()
    return (numAddedMessages, totalMessages)
def updateAllIndividualConversations(cursor, limit=500, save=True):
    """Update every known one-on-one conversation (all ids in idToUserMap)."""
    inserted, total = updateConversations(cursor, list(idToUserMap.keys()), limit=limit, save=save)
    print('\nInserted %s new messages in all individual conversations for a total of %s messages.' % (inserted, total))
def searchConversation(cursor, conversationID, query, pageSize=None, latestFirst=False, authorFilter=None, regularExpression=False, caseSensitive=False):
    """Search one conversation's messages and page the results through the
    terminal, with optional termgraph summaries of match frequency and
    per-author counts. pageSize defaults to the current terminal size."""
    if not pageSize:
        pageSize = shutil.get_terminal_size((80, 20))
    messages = getMessages(cursor, conversationID, query=query, regularExpression=regularExpression, caseSensitive=caseSensitive)
    # Widest author name, used to right-align the name column below.
    maxAuthorLength = max(map(len, map(idToUser, map(operator.itemgetter('author'), messages))))
    print('%s message results found for search query "%s".\n' % (len(messages), query))
    if termgraph:
        regex = re.compile(query, flags=(0 if caseSensitive else re.IGNORECASE))
        # Frequency of each distinct matched string across all result bodies.
        instances, counts = zip(*reversed(sorted(collections.Counter(itertools.chain.from_iterable(map(lambda x: map(lambda x: '"%s"' % x, regex.findall(x['body'])), messages))).items(), key=operator.itemgetter(1))))
        graph = termgraph.TermGraph(labels=instances, data=counts)
        graph.render()
        print('\n')
        # Matches per author.
        authors, counts = zip(*collections.Counter(map(lambda x: idToUser(x['author']), messages)).items())
        graph = termgraph.TermGraph(labels=authors, data=counts)
        graph.render()
        input('\nPress enter to display results...')
        os.system('cls' if os.name == 'nt' else 'clear')
    if latestFirst:
        messages = reversed(messages)
    currentLine = 0
    for message in messages:
        author = idToUser(message['author'])
        if not authorFilter or author == idToUser(authorFilter):
            if message['body']:
                body = message['body']
            elif message['has_attachment']:
                body = ' '.join(list(map(lambda x: '[%s %s]' % (x['attach_type'], x['url']), message['attachments'])))
            else:
                body = '[???]'
            timestamp = datetime.datetime.fromtimestamp(message['timestamp'] / 1e3).strftime('%m-%d-%Y %H:%M:%S')
            output = '(%s)\t%s: %s' % (timestamp, author.rjust(maxAuthorLength), body)
            # Lines this entry occupies once wrapped to the terminal width.
            outputLines = sum(map(lambda x: math.ceil(len(x) / pageSize.columns), output.split('\n')))
            if currentLine + outputLines > (pageSize.lines - 2):
                # Screenful filled: wait for the user, then clear and continue.
                input('\nPress enter to continue...')
                os.system('cls' if os.name == 'nt' else 'clear')
                currentLine = 0
            print(output)
            currentLine += outputLines
def login(password=None):
    """Log into Facebook with session.username (+ password, prompted when not
    given) via the mobile login form and return a populated Session.

    Exits the process when the fb_dtsg token cannot be scraped afterwards,
    which is how login failure is detected.
    """
    if not password:
        password = getpass.getpass('Password (%s): ' % session.username)
    s = requests.Session()
    s.headers.update({
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    })
    # NOTE(review): the login page is fetched twice — presumably the first GET
    # seeds cookies before the parsed fetch; confirm whether one suffices.
    s.get('https://m.facebook.com/login.php')
    loginPage = lxml.html.fromstring(s.get('https://m.facebook.com/login.php').text)
    # Reuse the form's hidden fields (lsd etc.) and fill in the credentials.
    loginForm = dict(loginPage.forms[0].fields)
    loginForm.update({
        'email': session.username,
        'pass': password
    })
    r = s.post('https://m.facebook.com/login.php?refsrc=https%3A%2F%2Fm.facebook.com%2Flogin.php&lwv=100&refid=9', data=loginForm)
    # The fb_dtsg CSRF token only appears on the desktop site when logged in.
    dtsg = re.findall(r'\["DTSGInitialData",\[\],\{"token":"(.*?)"}', s.get('https://www.facebook.com/').text)
    if not dtsg:
        print('Login failed.')
        sys.exit(1)
    else:
        dtsg = dtsg[0]
    print('Login successful (dtsg: %s).\n' % dtsg)
    return Session(
        username=session.username,
        id=s.cookies.get_dict()['c_user'],
        session=s,
        dtsg=dtsg
    )
def browserLogin(username):
    """Open a Chrome window for an interactive Facebook login (handles 2FA and
    captchas that the scripted login cannot) and return (dtsg, cookies).

    Waits up to two minutes for the user to finish logging in; the wait
    succeeds once the fb_dtsg token appears in the page source (and the user
    is not on a security checkpoint page).
    NOTE(review): find_element_by_name is the selenium 3 API — verify against
    the installed selenium version.
    """
    with contextlib.closing(webdriver.Chrome()) as driver:
        driver.get('https://www.facebook.com/login.php')
        # Pre-fill the email field and focus the password field for the user.
        actionChain = ActionChains(driver)
        actionChain.move_to_element(driver.find_element_by_name('email')).send_keys(username)
        actionChain.move_to_element(driver.find_element_by_name('pass')).click()
        actionChain.perform()
        dtsg = WebDriverWait(driver, 2 * 60).until(
            lambda driver:
                re.findall(r'\["DTSGInitialData",\[\],\{"token":"(.*?)"}', driver.page_source) if driver and 'checkpoint' not in driver.current_url else False
        )[0]
        cookies = dict(map(lambda x: (x['name'], x['value']), driver.get_cookies()))
        return dtsg, cookies
def validConversation(idOrName):
    """argparse type: accept a Facebook id (optionally 'fbid:'-prefixed), a
    group thread id, or a known display name; return the normalized id string.

    Raises argparse.ArgumentTypeError for anything unrecognized.
    """
    # BUG FIX: the original called idToUser(idToUser) — passing the function
    # itself instead of idOrName — so known ids were never recognized here.
    if not idToUser(idOrName).startswith('Unknown') or idOrName in map(str, groups):
        return str(idOrName).replace('fbid:', '')
    elif userToId(idOrName):
        return userToId(idOrName)
    else:
        raise argparse.ArgumentTypeError('%s is not a valid Facebook ID or recognized name' % idOrName)
def validNewPath(path):
    """argparse type: accept a path where a new file may be created.

    Raises argparse.ArgumentTypeError when something already exists at the
    path or the parent directory is not writable.
    """
    if os.path.exists(path):
        raise argparse.ArgumentTypeError('%s already contains a file' % path)
    # os.path.dirname('') is '' for a bare filename and os.access('', W_OK)
    # is always False, which wrongly rejected bare names; treat '' as CWD.
    parent = os.path.dirname(path) or '.'
    if os.access(parent, os.W_OK):
        return path
    raise argparse.ArgumentTypeError('%s is not a valid path for new message database' % path)
def validCookies(cookieString):
    """argparse type: parse a 'k=v; k2=v2;' cookie string into a dict.

    Raises argparse.ArgumentTypeError on malformed input.
    """
    try:
        cookieString = cookieString.strip(';')
        # Split on the first '=' only so values containing '=' survive.
        return dict(map(lambda x: tuple(x.strip().split('=', 1)), cookieString.split(';')))
    except Exception as err:
        # BUG FIX: the original wrote `except err:`, referencing an undefined
        # name — any parse failure raised NameError instead of a clean error.
        raise argparse.ArgumentTypeError('%s is an invalid cookie string' % cookieString) from err
def main(args):
    """Entry point: dispatch on the parsed CLI arguments.

    Handles database initialization, authentication (browser-assisted,
    manual dtsg/cookies, or password), then exactly one of the mutually
    exclusive modes: update, search, or statistics. Unless a read-only mode
    or --dry-run is used, the working copy of the database is re-compressed
    and written back over args.database before returning.
    """
    if args.init_db:
        initDB(args.init_db)
        return print('Database initialization complete.')

    # Start with an unauthenticated session; the branches below replace it.
    global session
    session = Session(username=args.username, session=None, id=None, dtsg=None)
    if getattr(args, 'browser', None):  # --browser only exists when selenium imported cleanly
        try:
            args.dtsg, args.cookies = browserLogin(args.username)
            print('Login successful (dtsg: %s, cookies: %s).\n' % (args.dtsg, ';'.join(map(lambda x: '%s=%s' % x, args.cookies.items()))))
        except Exception as e:
            return print('Login failed: %s.' % repr(e))
    if args.dtsg and args.cookies:
        # Manual (or browser-derived) authentication: adopt the provided cookies.
        s = requests.Session()
        s.cookies.update(args.cookies)
        session = Session(
            username=args.username,
            session=s,
            id=args.cookies['c_user'],
            dtsg=args.dtsg
        )
    elif args.password:
        session = login(password=args.password)

    # Work against a temporary decompressed copy of the gzip'd SQLite file;
    # the original stays untouched until the save step at the end.
    print('Initializing... ', end='', flush=True)
    start_time = datetime.datetime.now()
    db = tempfile.NamedTemporaryFile(mode='w+b')
    shutil.copyfileobj(args.database, db)
    db.flush()
    conn = sqlite3.connect(db.name)
    cursor = conn.cursor()
    end_time = datetime.datetime.now()
    print('complete. ({})\n'.format(end_time - start_time))

    save = not args.dry_run
    if args.interactive:
        raise NotImplementedError('Interactive mode is not yet implemented')
    elif args.update:
        for conversation in args.update:
            if conversation in groups:
                updateConversation(cursor, conversation, group=True, limit=(args.limit * args.group_limit_multiplier), save=save)
            else:
                updateConversation(cursor, conversation, limit=args.limit, save=save)
    elif args.update_all or args.update_individuals or args.update_groups:
        # BUG FIX: --update-all previously fell into an if/else and only
        # updated individual conversations; groups were silently skipped.
        # Both branches now run for --update-all.
        if args.update_all or args.update_individuals:
            updateAllIndividualConversations(cursor, limit=args.limit, save=save)
        if args.update_all or args.update_groups:
            for group in groups:
                updateConversation(cursor, group, group=True, limit=(args.limit * args.group_limit_multiplier), save=save)
    elif args.search:
        searchConversation(
            cursor,
            args.search[0],
            args.search[1],
            latestFirst=args.latest_first,
            authorFilter=args.author,
            regularExpression=args.regular_expression,
            caseSensitive=args.case_sensitive
        )
    elif args.statistics is not None:
        # nargs='*' yields [] for "all conversations" vs. a non-empty list
        # of specific conversations.
        if len(args.statistics):
            for conversation in args.statistics:
                messagesStats(
                    getMessages(cursor, conversation),
                    # These flags only exist when matplotlib/wordcloud imported.
                    plotMessageCount=getattr(args, 'plot_message_count', None),
                    plotCumulativeMessageCount=getattr(args, 'plot_cumulative_message_count', None),
                    wordClouds=getattr(args, 'word_clouds', None),
                    limitPlotToStreak=getattr(args, 'limit_plot_to_streak', None)
                )
        else:
            allMessagesStats(cursor, plotMessageCount=getattr(args, 'plot_message_count', None))
    conn.commit()
    conn.close()

    if not (args.dry_run or args.search or (args.statistics is not None)):
        print('\nSaving and compressing database... ', end='', flush=True)
        start_time = datetime.datetime.now()
        # Keep a temporary backup so a failed re-compression can be rolled back.
        shutil.copyfile(args.database.name, args.database.name + '.tmp')
        try:
            os.remove(args.database.name)
            with gzip.open(args.database.name, 'wb') as db_compressed:
                with open(db.name, 'rb') as db_uncompressed:
                    db_compressed.writelines(db_uncompressed)
        # BUG FIX: was a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit during the save.
        except Exception:
            print('Unable to resave file, restoring temporary copy.')
            shutil.copyfile(args.database.name + '.tmp', args.database.name)
        finally:
            os.remove(args.database.name + '.tmp')
        end_time = datetime.datetime.now()
        print('complete. ({})'.format(end_time - start_time))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Download, archive, analyze and plot Facebook Messenger conversations (individual and group)',
        # Advertise in the help epilog which optional features are disabled
        # because their imports failed at startup.
        epilog=', '.join(filter(operator.truth, [
            'selenium not installed, browser authentication disabled' if not webdriver else None,
            'matplotlib not installed, message plotting disabled' if not matplotlib else None,
            'wordcloud not installed, word clouds disabled' if not wordcloud else None
        ]))
    )
    parser.add_argument('-b', '--database', help='Path to gzip compressed SQLite message database', default='/Users/Sushain/Dropbox/Miscellaneous/chats.db.gz')

    # Exactly one mode must be chosen per invocation.
    modeNamedGroup = parser.add_argument_group('mode')
    modeGroup = modeNamedGroup.add_mutually_exclusive_group(required=True)
    modeGroup.add_argument('-i', '--interactive', action='store_true', help='Interactive mode')
    modeGroup.add_argument('-k', '--update', type=validConversation, nargs='+', metavar='ID/NAME', help='Update a conversation')
    modeGroup.add_argument('-a', '--update-all', action='store_true', help='Update all conversations')
    modeGroup.add_argument('-n', '--update-individuals', action='store_true', help='Update all individual conversations')
    modeGroup.add_argument('-g', '--update-groups', action='store_true', help='Update all group conversations')
    modeGroup.add_argument('-s', '--search', metavar=('ID/NAME', 'QUERY'), nargs=2, help='Search a conversation')
    modeGroup.add_argument('-t', '--statistics', type=validConversation, nargs='*', metavar='ID/NAME', help='Display conversation(s) statistics (all conversations statistics void of an argument)')
    modeGroup.add_argument('--init-db', metavar='PATH', type=validNewPath, help='Initialize SQLite message database here')

    authGroup = parser.add_argument_group('authentication', 'Conversation authentication options')
    authGroup.add_argument('-u', '--username', default='sushain97', help='Facebook account username')
    authGroup.add_argument('-p', '--password', help='Facebook account password')
    if webdriver:
        # Browser-assisted login is only offered when selenium imported.
        authGroup.add_argument('--browser', action='store_true', default=False, help='Facebook browser authentication')
        authGroup.add_argument('--browser-timeout', type=int, default=2, help='Facebook browser authentication timeout in minutes')
    # NOTE: --dtsg/--cookies are registered unconditionally — the validation
    # below (and main()) reads args.dtsg/args.cookies regardless of whether
    # selenium is available.
    authGroup.add_argument('--dtsg', help='Facebook dtsg value (must use --cookies as well)')
    authGroup.add_argument('--cookies', type=validCookies, help='Facebook cookies value (must use --dtsg as well)')

    statsGroup = parser.add_argument_group('statistics', 'Conversation statistics options')
    if matplotlib:
        statsGroup.add_argument('-P', '--plot-message-count', action='store_true', default=False, help='Plot individual message count over time')
        statsGroup.add_argument('-Q', '--plot-cumulative-message-count', action='store_true', default=False, help='Plot individual cumulative message count over time')
        statsGroup.add_argument('-S', '--limit-plot-to-streak', action='store_true', default=False, help='Limit message plot to time since streak started')
    if wordcloud:
        statsGroup.add_argument('-W', '--word-clouds', action='store_true', default=False, help='Display individual message word clouds')

    searchGroup = parser.add_argument_group('search', 'Conversation search options')
    searchGroup.add_argument('-F', '--latest-first', action='store_true', default=False, help='Show latest messages first')
    searchGroup.add_argument('-I', '--regular-expression', action='store_true', default=False, help='Treat search query as regular expression')
    searchGroup.add_argument('-A', '--author', type=validConversation, metavar='ID/NAME', help='Show only messages from this author')
    searchGroup.add_argument('-C', '--case-sensitive', action='store_true', default=False, help='Case sensitive search')

    downloadGroup = parser.add_argument_group('download', 'Conversation download options')
    downloadGroup.add_argument('-L', '--limit', type=int, default=500, help='Message download limit')
    downloadGroup.add_argument('-M', '--group-limit-multiplier', action='count', default=2, help='Multiply message download limit for groups')
    downloadGroup.add_argument('-D', '--dry-run', action='store_true', default=False, help='Don\'t save to database')

    args = parser.parse_args()
    if not args.init_db:
        # Hand main() an open gzip stream of the message database.
        args.database = gzip.open(args.database)
    if bool(args.dtsg) ^ bool(args.cookies):
        parser.error('--dtsg and --cookies must both be set for manual authentication.')
    if args.search:
        # -s takes raw strings (nargs=2 can't mix types); normalize the
        # conversation argument through the same validator as the other modes.
        args.search[0] = validConversation(args.search[0])
    plotMessagesArgRequired = getattr(args, 'limit_plot_to_streak', False)
    plottingMessages = getattr(args, 'plot_cumulative_message_count', False) or getattr(args, 'plot_message_count', False)
    if plotMessagesArgRequired and not plottingMessages:
        # BUG FIX: the message previously misspelled the flag as '--plot_message-count'.
        parser.error('--plot-message-count or --plot-cumulative-message-count must be set when --limit-plot-to-streak is.')
    main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment