-
-
Save purneshwar/f53bd07beb23bc06a47579e473ef91fe to your computer and use it in GitHub Desktop.
Convert/Parse Google Takeout/Export Data Hangouts/Chat into individual conversations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run this in the same directory as the Hangouts.json file generated by the Google Takeout / Data Export tool.
# Usage: python3 hangouts.py
import datetime
import json
import os
import re
import shutil
import sys
# Directory that receives one text file per conversation.
chat_dir = "hangouts"

# Never silently overwrite a previous export: ask before deleting it.
if os.path.exists(chat_dir):
    print("Hangouts directory already exists. Should I remove it before proceeding? y/n")
    # Strip surrounding whitespace so an answer such as "y " is still accepted.
    if input().strip().lower() == "y":
        print("Removing the hangouts directory.")
        shutil.rmtree(chat_dir)
    else:
        print("Exiting.")
        # sys.exit() rather than quit(): quit() is injected by the `site`
        # module for interactive use and is not guaranteed to exist.
        sys.exit()
class User:
    """A chat participant: an identifier plus a (mutable) display name."""

    def __init__(self, id, name):
        """Store the participant's identifier and display name.

        Args:
            id: Identifier of the participant.
            name: Display name; may be updated later through setName().
        """
        self.id = id
        self.name = name

    def __repr__(self):
        return "User(id={!r}, name={!r})".format(self.id, self.name)

    def setName(self, name):
        """Replace the display name."""
        self.name = name

    def getId(self):
        """Return the participant's identifier."""
        return self.id

    def getName(self):
        """Return the participant's current display name."""
        return self.name
class Message:
    """A single chat message with its sender and a local-time timestamp."""

    def __init__(self, id, sender_id, sender_name, timestamp, text, utc_offset_hours=2):
        """Build a message and convert its raw timestamp to a datetime.

        Args:
            id: Identifier of the message/event.
            sender_id: Identifier of the sending participant.
            sender_name: Display name of the sender, used by display().
            timestamp: Microseconds since 1970-01-01 00:00:00, as an int or
                anything int() accepts (e.g. a numeric string).
            text: Message body.
            utc_offset_hours: Hours added to the converted time to obtain the
                local time that is displayed. Defaults to 2 (GMT+2), which is
                the offset this script has always applied.
        """
        self.id = id
        self.sender_id = sender_id
        self.sender_name = sender_name
        # Naive datetime: epoch + microseconds, then shifted by the fixed offset.
        self.timestamp = datetime.datetime(1970, 1, 1) + datetime.timedelta(
            microseconds=int(timestamp), hours=utc_offset_hours
        )
        self.text = text

    def display(self):
        """Return the message as ``[YYYY-MM-DD HH:MM:SS] sender: text``."""
        return "[{}] {}: {}".format(
            self.timestamp.strftime('%Y-%m-%d %H:%M:%S'),
            self.sender_name,
            self.text,
        )
class ConversationSet:
    """Registry of conversations, keyed by conversation id.

    Intended use is two passes over the export: first register every
    conversation's participants, then add the events, so that a participant
    whose name is missing in one conversation can be resolved from another.
    """

    def __init__(self):
        # Maps conversation id -> Conversation.
        self.conversations = dict()

    def getParticipantNameById(self, id):
        """Return a known display name for participant ``id``.

        Searches every registered conversation and returns the first
        non-empty name found. Participants registered without a name carry an
        empty placeholder, which is skipped so that a real name stored in a
        different conversation can still be found.

        Returns:
            The participant's name, or "Unknown" when no conversation knows it.
        """
        for conversation in self.conversations.values():
            participant = conversation.getParticipantById(id)
            if participant is not None and participant.name:
                return participant.name
        return "Unknown"

    def addConversationParticipants(self, id, json_participant_data):
        """Register conversation ``id`` and its participants.

        Any conversation previously stored under the same id is replaced.

        Args:
            id: Conversation id.
            json_participant_data: Iterable of dicts; each must provide
                ``["id"]["gaia_id"]`` and may provide ``"fallback_name"``.
                A missing name is stored as an empty string.
        """
        conversation = Conversation(id)
        self.conversations[id] = conversation
        for participant in json_participant_data:
            conversation.addParticipant(
                participant["id"]["gaia_id"],
                participant.get("fallback_name", ""),
            )

    def addConversationEvents(self, id, json_participant_data, json_event_data):
        """Finalise participant names and add the chat messages of ``id``.

        Only events whose ``event_type`` is "REGULAR_CHAT_MESSAGE" are stored;
        the text of their segments is concatenated into one message.

        Args:
            id: Id of a conversation already registered through
                addConversationParticipants().
            json_participant_data: Same participant dicts used at registration.
            json_event_data: Iterable of event dicts from the export.

        Raises:
            KeyError: If ``id`` has not been registered, or an event lacks a
                required key.
        """
        conversation = self.conversations[id]

        for participant in json_participant_data:
            p_id = participant["id"]["gaia_id"]
            if "fallback_name" in participant:
                name = participant["fallback_name"]
            else:
                # Name missing here: borrow it from another conversation.
                name = self.getParticipantNameById(p_id)
            conversation.setParticipantName(p_id, name)

        for event in json_event_data:
            if event["event_type"] != "REGULAR_CHAT_MESSAGE":
                continue

            content = event["chat_message"]["message_content"]
            # A message may have no segments at all, and a segment may carry
            # no text; both contribute nothing to the message body.
            text = "".join(
                segment.get("text", "") for segment in content.get("segment", [])
            )

            sender_id = event["sender_id"]["gaia_id"]
            sender = conversation.getParticipantById(sender_id)
            if sender is not None:
                sender_name = sender.name
            else:
                # The sender is not listed among this conversation's
                # participants; resolve the name elsewhere instead of crashing.
                sender_name = self.getParticipantNameById(sender_id)

            conversation.addMessage(
                event["event_id"],
                sender_id,
                sender_name,
                event["timestamp"],
                text,
            )

    def getConversations(self):
        """Return a new list of all conversations, in registration order."""
        return list(self.conversations.values())

    def getConversationById(self, id):
        """Return the conversation registered under ``id``, or None."""
        return self.conversations.get(id)
class Conversation:
    """One conversation: its participants (by id) and its messages in insertion order."""

    def __init__(self, id):
        self.id = id
        # Maps participant id -> User.
        self.participants = dict()
        # Message objects in the order they were added.
        self.messages = []

    def addParticipant(self, id, name):
        """Add a participant unless one with the same id already exists."""
        if id not in self.participants:
            self.participants[id] = User(id, name)

    def addMessage(self, id, sender_id, sender_name, timestamp, text):
        """Append a new Message built from the given fields."""
        self.messages.append(Message(id, sender_id, sender_name, timestamp, text))

    def getMessages(self):
        """Return the internal list of messages (not a copy)."""
        return self.messages

    def setParticipantName(self, id, name):
        """Rename participant ``id``; unknown ids are ignored."""
        participant = self.participants.get(id)
        if participant is not None:
            participant.setName(name)

    def getId(self):
        """Return the conversation id."""
        return self.id

    def getParticipants(self):
        """Return a new list of the participants, in insertion order."""
        return list(self.participants.values())

    def getParticipantById(self, id):
        """Return the participant stored under ``id``, or None."""
        return self.participants.get(id)

    def participantCount(self):
        """Return the number of participants."""
        return len(self.participants)
def get_valid_filename(s):
    """Return ``s`` reduced to characters that are safe in a file name.

    Leading and trailing whitespace is removed, remaining spaces become
    underscores, and every character that is not a word character, a dash or
    a dot is dropped. The result may be an empty string.

    Adapted from Django's ``django.utils.text.get_valid_filename``:
    https://github.com/django/django/blob/master/django/utils/text.py#L218
    """
    s = str(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', s)
print("Processing Hangouts.json ..")

# Read as UTF-8 explicitly so the platform's default encoding cannot break
# parsing of non-ASCII names or messages.
with open('Hangouts.json', 'r', encoding='utf-8') as f:
    hangouts_dict = json.load(f)

conversations = ConversationSet()

# Pass 1: register every conversation with its participants, so that names
# missing in one conversation can be looked up in the others during pass 2.
for hangout in hangouts_dict["conversations"]:
    if "conversation" in hangout:
        conversations.addConversationParticipants(
            hangout["conversation"]["conversation_id"]["id"],
            hangout["conversation"]["conversation"]["participant_data"]
        )

# Pass 2: resolve participant names and collect the chat messages.
for hangout in hangouts_dict["conversations"]:
    if "conversation" in hangout:
        conversations.addConversationEvents(
            hangout["conversation"]["conversation_id"]["id"],
            hangout["conversation"]["conversation"]["participant_data"],
            hangout.get("events", [])
        )

os.makedirs(chat_dir)

for c in conversations.getConversations():
    participants = [p.name for p in c.getParticipants()]
    # "-" names the file of a conversation that has no participants at all.
    f_name = " and ".join(participants) if participants else "-"

    # Several conversations can share the same participants; keep appending
    # _2, _3, ... until the name is free so no earlier file is overwritten.
    base_name = get_valid_filename(f_name)
    c_path = os.path.join(chat_dir, base_name + ".txt")
    suffix = 2
    while os.path.isfile(c_path):
        c_path = os.path.join(chat_dir, "{}_{}.txt".format(base_name, suffix))
        suffix += 1

    # The context manager closes the file even if a write fails; UTF-8 keeps
    # non-ASCII message text writable on every platform.
    with open(c_path, "w", encoding="utf-8") as c_file:
        c_file.writelines(m.display() + "\n" for m in c.getMessages())

print("Done. Check the hangouts directory for chat output files.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment