Skip to content

Instantly share code, notes, and snippets.

@colorstain
Created February 7, 2013 00:17
Show Gist options
  • Save colorstain/4727194 to your computer and use it in GitHub Desktop.
Save colorstain/4727194 to your computer and use it in GitHub Desktop.
Small program that extracts chat logs from Gmail and saves them to a CSV file.
__author__ = 'colorstain@gmail.com'
import email
from datetime import datetime, timedelta
from imapclient import IMAPClient
from bs4 import BeautifulSoup #make sure it's the latest version
import csv
def parse_chats(xml):
    """Extract chat messages from a chat-log XML document.

    Every ``<message>`` element that contains a ``<body>`` element yields
    one dict with these keys:

    * ``'from'`` -- the element's ``from`` attribute.
    * ``'date'`` -- a local-time ``datetime`` built from the ``time-stamp``
      attribute, which is treated as milliseconds since the epoch.
    * ``'body'`` -- the body text encoded as UTF-8, stripped, with
      newlines replaced by spaces.

    A missing or non-numeric ``time-stamp`` falls back to 0 (the epoch).

    Returns the list of message dicts; it is empty when the document has
    no ``<message>`` elements.
    """
    soup = BeautifulSoup(xml, features='xml')
    message_tags = soup.find_all('message')
    if not message_tags:
        print('no messages in the xml')
        return []

    messages = []
    for msg in message_tags:
        # Messages without a <body> element carry no chat text; skip them.
        if not msg.body:
            continue

        # Use a default value of 0 when there's no timestamp.
        try:
            millis = int(msg.get('time-stamp', 0))
        except ValueError:
            print('Error parsing time-stamp')
            # Without this fallback the timestamp would be unbound (first
            # message) or stale from the previous message (later ones).
            millis = 0

        messages.append({
            'from': msg['from'],
            'date': datetime.fromtimestamp(millis // 1000),
            'body': msg.body.text.encode('utf-8').strip().replace('\n', ' '),
        })
    return messages
def write_to_csv(file_name, chats):
    """Write the chat dicts to ``<file_name>.csv``.

    The file starts with a header row and has the columns ``date``,
    ``from`` and ``body``, in that order. Each item of ``chats`` must be
    a dict keyed by those column names.
    """
    columns = ['date', 'from', 'body']
    path = file_name + '.csv'
    # Binary mode is what the Python 2 csv module expects.
    with open(path, 'wb') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns, delimiter=',')
        writer.writeheader()
        for row in chats:
            writer.writerow(row)
# Gmail connection settings -- replace the placeholders before running.
HOST = 'imap.gmail.com'
USERNAME = 'your-username'
PW = 'your-pw'

# Setting up the cutoff date for chats: only the last 365 days are exported.
today = datetime.today()
cutoff = today - timedelta(days=365)
# NOTE(review): %b is locale-dependent; this presumably relies on an
# English locale for the IMAP date format -- confirm.
since = cutoff.strftime('%d-%b-%Y')

# Initializing the connection.
server = IMAPClient(HOST, use_uid=True, ssl=True)
server.login(USERNAME, PW)
try:
    # Selecting the Chats folder; make sure to enable it in the labels
    # page of the Gmail settings.
    info = server.select_folder('[Gmail]/Chats', readonly=True)
    print('%d messages in Chats' % info['EXISTS'])

    messages = server.search(['SINCE %s' % since])
    print('%d messages since %s' % (len(messages), since))

    print('Fetching messages')
    response = server.fetch(messages, ['RFC822'])
finally:
    # Always end the IMAP session, even when a command above fails.
    server.logout()

chats = []  # container for all the chats
print('Parsing messages')
for data in response.values():
    msg = email.message_from_string(data['RFC822'])  # parsing message
    # The response is a multipart message, so we iterate through its
    # parts to find the XML part that holds all our chats.
    for part in msg.walk():
        if part.get_content_type() != 'text/xml':
            continue
        # The body is in UTF-8, so we need to decode it.
        body = part.get_payload(decode=True)
        xml = body.decode('utf-8')  # this is the body containing the chats
        chats.extend(parse_chats(xml))

# Saving it to a CSV file.
filename = '%s %s' % (USERNAME, since)
print('Writing to %s.csv' % filename)
write_to_csv(filename, chats)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment