Created
February 7, 2013 00:17
-
-
Save colorstain/4727194 to your computer and use it in GitHub Desktop.
A small program that extracts chat logs from Gmail and saves them to a CSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'colorstain@gmail.com' | |
import email | |
from datetime import datetime, timedelta | |
from imapclient import IMAPClient | |
from bs4 import BeautifulSoup #make sure it's the latest version | |
import csv | |
def parse_chats(xml):
    """Extract chat messages from one chat-log XML document.

    Every ``<message>`` tag that has a ``<body>`` child yields one record.
    The text handling is written to behave the same on Python 2 as before
    while also running on Python 3.

    Args:
        xml: The XML markup to parse (handed to BeautifulSoup with the
            ``xml`` feature set).

    Returns:
        A list of dicts with the keys ``'from'`` (the tag's ``from``
        attribute), ``'date'`` (a ``datetime`` built from the millisecond
        ``time-stamp`` attribute; timestamp 0 is used when the attribute is
        missing or is not an integer) and ``'body'`` (the body text,
        stripped, with newlines replaced by spaces; UTF-8 encoded on
        Python 2). The list is empty when no ``<message>`` tag is found.
    """
    soup = BeautifulSoup(xml, features='xml')
    message_tags = soup.find_all('message')
    if not message_tags:
        print('no messages in the xml')
        return []

    # On Python 2 ``str`` is the byte-string type; only there is the body
    # encoded to UTF-8 bytes. On Python 3 the text is kept as ``str`` so the
    # string operations below keep working.
    encode_body = str is bytes

    messages = []
    for tag in message_tags:
        if not tag.body:
            continue  # nothing to record for a message without a body

        sender = tag['from']

        # use default value of 0 when there's no timestamp
        raw_timestamp = tag.get('time-stamp', 0)
        try:
            millis = int(raw_timestamp)
        except ValueError:
            print('Error parsing time-stamp')
            # Fall back to the same default as a missing attribute instead of
            # reusing the previous message's value (or hitting a NameError).
            millis = 0

        text = tag.body.text
        if encode_body:
            text = text.encode('utf-8')

        messages.append({
            'from': sender,
            # time-stamp is in milliseconds; fromtimestamp() expects seconds
            'date': datetime.fromtimestamp(millis // 1000),
            'body': text.strip().replace('\n', ' '),
        })
    return messages
def write_to_csv(file_name, chats):
    """Write chat records to ``<file_name>.csv`` with a header row.

    Args:
        file_name: Output path without the extension; ``'.csv'`` is appended.
        chats: Iterable of dicts keyed by ``'date'``, ``'from'`` and
            ``'body'``; each dict becomes one row, in that column order.
    """
    fieldnames = ['date', 'from', 'body']
    path = file_name + '.csv'

    if str is bytes:
        # Python 2 (``str`` is the byte-string type): csv needs binary mode.
        csv_file = open(path, 'wb')
    else:
        # Python 3: csv needs text mode with newline translation disabled;
        # UTF-8 matches the byte encoding used for bodies on Python 2.
        csv_file = open(path, 'w', newline='', encoding='utf-8')

    with csv_file:
        writer = csv.DictWriter(csv_file, delimiter=',',
                                fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(chats)
# Script body: log in to Gmail over IMAP, fetch every message in the Chats
# folder newer than the cutoff date, parse the chat XML attached to each
# message and dump all chats into one CSV file.
# NOTE(review): written for Python 2 and the imapclient API of that time;
# under Python 3 the fetched payload is presumably bytes -- confirm before
# porting.

#GMAIL information
# NOTE: placeholders -- fill in locally and never commit real credentials.
HOST = 'imap.gmail.com'
USERNAME = 'your-username'
PW = 'your-pw'

#setting up the cutoff date for chats (365 days back from today)
today = datetime.today()
cutoff = today - timedelta(days=365)
# formatted once; used for the IMAP search, the progress output and the
# output file name (e.g. 07-Feb-2012)
cutoff_label = cutoff.strftime('%d-%b-%Y')

#initializing connection
server = IMAPClient(HOST, use_uid=True, ssl=True)
try:
    server.login(USERNAME, PW)

    #selecting the Chats folder
    #make sure to enable it in the labels page of
    #the GMail settings
    info = server.select_folder('[Gmail]/Chats', readonly=True)
    print('%d messages in Chats' % info['EXISTS'])

    messages = server.search(['SINCE %s' % cutoff_label])
    print('%d messages since %s' % (len(messages), cutoff_label))

    print('Fetching messages')
    response = server.fetch(messages, ['RFC822'])
finally:
    # always end the IMAP session, even when one of the steps above fails;
    # everything needed for parsing is already in ``response``
    server.logout()

chats = []  #container for all the chats
print('Parsing messages')
for data in response.values():
    msg_string = data['RFC822']
    msg = email.message_from_string(msg_string)  #parsing message
    #response is a multipart message so we have to iterate
    #through it to find the xml response which has all
    #our chats
    for part in msg.walk():
        if part.get_content_type() != 'text/xml':
            continue
        #body is in utf-8 so we need to decode
        body = part.get_payload(decode=True)
        xml = body.decode('utf-8')  #this is the body containing the chats
        chats.extend(parse_chats(xml))

#saving it to a csv file
filename = '%s %s' % (USERNAME, cutoff_label)
print('Writing to %s.csv' % filename)
write_to_csv(filename, chats)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment