A short project to run analytics on emails on a couple of mailing lists
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import string | |
import pytz | |
import operator | |
from datetime import datetime | |
from nltk.corpus import stopwords | |
from collections import defaultdict | |
def sort_dict(dictionary):
    """Return the dict's (key, value) pairs as a list sorted by value, descending."""
    return sorted(dictionary.items(), key=lambda pair: pair[1], reverse=True)
# EXTRACT DATA FROM .EMAILS FILES | |
LIST_OF_EMAILS = ['therearenosecrets@mit.edu', 'bentry@mit.edu', 'b-entry@mit.edu', 'letsshitoncourtney@mit.edu'] | |
TOTAL_BENTRY_POPULATION = 36 | |
emails = {} | |
for email in LIST_OF_EMAILS: | |
path = email.split('@')[0] + '.json' | |
with open(path) as f: | |
lines = f.readlines() | |
threads = map(lambda line: json.loads(line[:-1]), lines) | |
emails[email] = threads | |
# Start the analytics! | |
analytics_dump = {} | |
print '\nWhich B Entry alias is most popular?' | |
email_alias_counts = {} | |
for email, threads in emails.iteritems(): | |
num_messages = reduce(lambda x,y: x + len(y), threads, 0) | |
num_threads = len(threads) | |
email_alias_counts[email] = [num_threads, num_messages] | |
analytics_dump['email_alias_counts'] = email_alias_counts | |
print '\t',email,'has got',num_threads,'threads and',num_messages,'messages' | |
def enum(*sequential, **named):
    """Build a simple Enum-like class.

    Positional names receive values 0..n-1, keyword names keep their given
    values, and the class attribute `reverse_mapping` maps each value back
    to its name.
    """
    enums = dict(zip(sequential, range(len(sequential))), **named)
    enums['reverse_mapping'] = dict((value, key) for key, value in enums.items())
    return type('Enum', (), enums)
print '\nWhat time is the busiest?' | |
time_counts = [0]*24 | |
day_counts = [0]*7 | |
#Monday=0, Tuesday=1, ..., Sunday=6 | |
month_counts = [0]*12 | |
Semester = enum(FALL=0,IAP=1,SPRING=2,SUMMER=3) | |
semesters = {Semester.FALL:0, Semester.IAP:0,Semester.SPRING:0,Semester.SUMMER:0} | |
all_semester_totals = {2013: semesters.copy(), 2014:semesters.copy()} | |
five_am_email = None | |
def get_semester(month, day):
    """Map a zero-indexed month (0=January) and day-of-month to a Semester.

    January -> IAP; February-May -> SPRING; June-July, plus August before
    the 26th -> SUMMER; everything else -> FALL.
    """
    if month == 0:
        return Semester.IAP
    if 1 <= month <= 4:
        return Semester.SPRING
    if 5 <= month <= 6 or (month == 7 and day < 26):
        return Semester.SUMMER
    return Semester.FALL
def get_time_info(timestamp, get_datetime_obj=False):
    """Break a millisecond Unix timestamp into (year, month, day, weekday, time).

    month is zero-indexed (0=January) so it can be used directly as a list
    index; weekday follows datetime.weekday() (Monday=0). All fields are in
    US/Eastern time.

    NOTE(review): get_datetime_obj is accepted but never used -- kept for
    interface compatibility.
    """
    eastern = pytz.timezone('US/Eastern')
    moment = datetime.fromtimestamp(int(timestamp) / 1000.0, tz=eastern)
    return (moment.year, moment.month - 1, moment.day, moment.weekday(), moment.time())
def get_school_year(year, month, day):
    """Return the school year a date belongs to.

    Fall dates keep their calendar year; IAP/spring/summer dates roll back
    to the previous fall's year.
    """
    if get_semester(month, day) == Semester.FALL:
        return year
    return year - 1
# Tally every unique message into the hour/day/month/semester tables.
# (Removed the unused `semester_totals_2` list the original declared here.)
encountered_messages = set()  # str(message) of each tallied message -- dedups across aliases
for email, threads in emails.iteritems():
    for thread in threads:
        for message in thread:
            key = str(message)
            if key in encountered_messages:
                continue
            encountered_messages.add(key)
            year, month, day, weekday, time = get_time_info(message['date'])
            time_counts[time.hour] += 1
            day_counts[weekday] += 1
            month_counts[month] += 1
            semester = get_semester(month, day)
            all_semester_totals[get_school_year(year, month, day)][semester] += 1
            if time.hour == 5:
                five_am_email = message  # last one seen wins
total_13_14 = sum(all_semester_totals[2013].values())
total_14_15 = sum(all_semester_totals[2014].values())
total = total_13_14 + total_14_15
print '\tEmail frequency across hours in the day', | |
print '\t\t', time_counts, sum(time_counts) | |
print '\tEmail frequency across the days of the week', | |
print '\t\t', day_counts, sum(day_counts) | |
print '\tEmail frequency across the months', | |
print '\t\t', month_counts, sum(month_counts) | |
print '\tEmail frequency by semester', | |
print '\t\t', all_semester_totals | |
print '\tTotal for 2013-2014:', total_13_14 | |
print '\tTotal for 2014-2015:', total_14_15 | |
print '\tAnd the only email ever sent between the hours of 5AM and 6AM in the last two years\n\n',five_am_email,'\n' | |
analytics_dump['time_counts'] = time_counts | |
analytics_dump['day_counts'] = day_counts | |
analytics_dump['month_counts'] = month_counts | |
analytics_dump['all_semester_totals'] = all_semester_totals | |
analytics_dump['total_13_14'] = total_13_14 | |
analytics_dump['total_14_15'] = total_14_15 | |
analytics_dump['total'] = total | |
# Analysis of sender email frequencies
# Key-value stores mapping a person's name to the set of email addresses
# they send from. Contents omitted for privacy reasons.
ENTRY = {}
# Names of entry residents extracted from the last two years of senders,
# listed above. Contents omitted for privacy reasons.
# BUG FIX: the original wrote `ENTRY_2013 = [# comment]`, which left the
# list bracket unclosed (the `]` was inside the comment) -- a SyntaxError.
ENTRY_2013 = []
ENTRY_2014 = list(set(ENTRY.keys()) - set(ENTRY_2013))  # everyone not already present in 2013
NON_ENTRY = {}
# Everyone we know about: entry residents and outsiders combined.
COMBINE = ENTRY.copy()
COMBINE.update(NON_ENTRY)
def find_sender_by_sendfield(query_email): | |
for person, email_ids in COMBINE.iteritems(): | |
for email in email_ids: | |
if email in query_email.lower(): | |
return person | |
print 'Could not find person', query_email | |
return None | |
def find_sender_by_name(query_name): | |
for person, email_ids in COMBINE.iteritems(): | |
names = person.split() | |
for name in names: | |
if name == "": continue | |
name = name.lower() | |
if name in query_name or query_name in name: | |
return person | |
print 'Could not find person', query_name | |
return None | |
# Bucket every message under the person who sent it.
emails_by_sender = defaultdict(list)
for email, threads in emails.iteritems():
    for thread in threads:
        for message in thread:
            person = find_sender_by_sendfield(message['sender'])
            emails_by_sender[person].append(message)

# Total messages per sender, overall and broken down by school year.
sender_count = dict((person, len(msgs)) for person, msgs in emails_by_sender.iteritems())
sender_count_by_year = {2013: defaultdict(int), 2014: defaultdict(int)}
for person, msgs in emails_by_sender.iteritems():
    for message in msgs:
        year, month, day, weekday, time = get_time_info(message['date'])
        sender_count_by_year[get_school_year(year, month, day)][person] += 1
print '\nSenders sorted by total number of emails' | |
sorted_senders = sort_dict(sender_count) | |
print '\t',sorted_senders | |
print '\nSenders every year, sorted by total number of emails' | |
sorted_senders_2013 = sort_dict(sender_count_by_year[2013]) | |
sorted_senders_2014 = sort_dict(sender_count_by_year[2014]) | |
print '\t2013:' | |
print '\t',sorted_senders_2013 | |
print '\t2014:' | |
print '\t',sorted_senders_2014 | |
total_unique_senders_2013 = len(sorted_senders_2013) | |
total_unique_senders_2014 = len(sorted_senders_2014) | |
percent_never_sent_entry_email = 3.0/TOTAL_BENTRY_POPULATION | |
print '\nThe total number of unique senders in 2013 was', total_unique_senders_2013 | |
print 'The total number of unique senders in 2014 was', total_unique_senders_2014 | |
print 'In 2014-2015,', percent_never_sent_entry_email, '% of the entry has never sent an email' | |
analytics_dump['total_unique_senders_2013'] = total_unique_senders_2013 | |
analytics_dump['total_unique_senders_2014'] = total_unique_senders_2014 | |
analytics_dump['percent_never_sent_entry_email'] = percent_never_sent_entry_email | |
analytics_dump['sorted_senders'] = sorted_senders | |
analytics_dump['sorted_senders_2013'] = sorted_senders_2013 | |
analytics_dump['sorted_senders_2014'] = sorted_senders_2014 | |
GRTs_2013 = 'kim' | |
GRTs_2014 = ['cooper', 'christine'] | |
total_grt_traffic_2013 = len(emails_by_sender[GRTs_2013]) | |
total_grt_traffic_2014 = len(emails_by_sender[GRTs_2014[0]]) + len(emails_by_sender[GRTs_2014[1]]) | |
total_grt_traffic = total_grt_traffic_2013 + total_grt_traffic_2013 | |
percent_traffic_GRT = total_grt_traffic*100.0/total | |
percent_traffic_GRTs_2013 = total_grt_traffic_2013*100.0/total_13_14 | |
percent_traffic_GRTs_2014 = total_grt_traffic_2014*100.0/total_14_15 | |
analytics_dump['percent_traffic_GRT'] = percent_traffic_GRT | |
analytics_dump['percent_traffic_GRTs_2013'] = percent_traffic_GRTs_2013 | |
analytics_dump['percent_traffic_GRTs_2014'] = percent_traffic_GRTs_2014 | |
print '\nGRTs make up', percent_traffic_GRT, "% of traffic." | |
print 'In 2013-2014, GRTs contributed to', percent_traffic_GRTs_2013, "% of traffic." | |
print 'And in 2014-2015, GRTs contributed to', percent_traffic_GRTs_2014, "% of traffic." | |
n = 5 | |
top_n_senders_total = 0 | |
for i in xrange(n): | |
top_n_senders_total += sorted_senders[i][1] | |
top_n_senders_percent = top_n_senders_total*100.0/total | |
top_n_senders_population_percent = n*100.0/(len(ENTRY)+len(NON_ENTRY)) | |
print '\nThe top',n, 'senders make up', top_n_senders_percent, '% of all traffic, but only', top_n_senders_population_percent, '% of all senders' | |
analytics_dump['top_n_senders_percent'] = top_n_senders_percent | |
analytics_dump['top_n_senders_population_percent'] = top_n_senders_population_percent | |
analytics_dump['n'] = n | |
max_thread = [] | |
for email, threads in emails.iteritems(): | |
for thread in threads: | |
max_thread = thread if len(thread) > len(max_thread) else max_thread | |
print '\nThe thread with the most number of participants has length', len(max_thread), 'and has subject', max_thread[0]['subject'], 'and first message:',max_thread[0]['body'][0:50],'...' | |
analytics_dump['len_max_thread'] = len(max_thread) | |
analytics_dump['max_thread_starter'] = max_thread[0]['body'][0:50] | |
analytics_dump['max_thread_sender'] = find_sender_by_sendfield(max_thread[0]['sender']) | |
# Word-frequency accumulators (cleaned word -> count).
all_subject_words = defaultdict(int)   # subject lines only
all_body_words = defaultdict(int)      # body text only
all_words = defaultdict(int)           # subjects and bodies combined
all_names_subject = defaultdict(int)   # person name -> mentions in subject lines

# Filters applied before counting.
FORBIDDEN_SNIPPETS = ['mitedu', 'gmail', 'http', 'att000', 'image001', 'jpg', 'png']  # substrings that mark junk tokens
BORING_WORDS = set(stopwords.words('english')) | {'re', 'fw', 'fwd', 'im', 'm', 'youre', '1', 'f', 'd', 'cc', 'pm', 'wrote', 'subject', 'sent', '2014', '30'}
NAMES = []  # list of names omitted for privacy reasons
MONTHS = ['oct', 'may', 'january', 'october', 'jan', 'feb', 'sep']
SIGNATURES = ['massachusetts', 'technology', 'institute', 'class']  # common signature boilerplate
def fix_word(word):
    """Normalize one token: lowercase, strip surrounding whitespace, drop
    non-ASCII characters, then delete all punctuation (Python 2 two-argument
    str.translate with an identity table)."""
    cleaned = word.lower().strip()
    cleaned = cleaned.encode('ascii', 'ignore')
    return cleaned.translate(string.maketrans("", ""), string.punctuation)
def should_filter(fixed_word, filter_names=True, filter_months=True, filter_number = True, filter_signatures = True):
    """Return True when a cleaned token should be excluded from word counts.

    Drops junk substrings (addresses, URLs, attachment artifacts), long
    'cid' inline-attachment ids, stopwords/boring words, empty tokens, and
    -- each individually switchable -- names, month words, signature words,
    and pure numbers.
    """
    # any() replaces the original reduce(lambda x, y: x or y in ..., ...).
    if any(snippet in fixed_word for snippet in FORBIDDEN_SNIPPETS): return True
    if fixed_word[0:3] == 'cid' and len(fixed_word) > 20: return True  # inline-attachment content id
    if fixed_word in BORING_WORDS: return True
    if filter_names and fixed_word in NAMES: return True
    if filter_months and fixed_word in MONTHS: return True
    if filter_signatures and fixed_word in SIGNATURES: return True
    if not fixed_word: return True  # token reduced to nothing by cleaning
    if not filter_number:
        return False
    try:
        int(fixed_word)
    except ValueError:  # BUG FIX: was a bare except that hid unrelated errors
        return False
    return True
def is_name(word):
    """True when the cleaned word is both a known name and a known person key."""
    known_people = set(NAMES) & set(COMBINE.keys())
    return word in known_people
# Tally cleaned words from every message body and subject line.
for email, threads in emails.iteritems():
    for thread in threads:
        for message in thread:
            for raw_token in message['body'].split():
                word = fix_word(raw_token)
                if not should_filter(word):
                    all_body_words[word] += 1
                    all_words[word] += 1
            for raw_token in message['subject'].split():
                word = fix_word(raw_token)
                # Names are tallied even when the token itself is filtered out.
                if is_name(word):
                    all_names_subject[word] += 1
                if not should_filter(word):
                    all_subject_words[word] += 1
                    all_words[word] += 1
print '\n','100 most frequenct words in body lines' | |
sorted_body_words = sort_dict(all_body_words) | |
print '\t',sorted_body_words[:100] | |
print '\n','100 most frequenct words in subject lines' | |
sorted_subject_words = sort_dict(all_subject_words) | |
print '\t',sorted_subject_words[:100] | |
print '\n','100 most frequenct names in subject' | |
sorted_subject_names = sort_dict(all_names_subject) | |
print '\t',sorted_subject_names | |
analytics_dump['sorted_body_words'] = sorted_body_words | |
analytics_dump['sorted_subject_words'] = sorted_subject_words | |
analytics_dump['sorted_subject_names'] = sorted_subject_words | |
profanity_file = open('google_bad_words.json','r') | |
profanity_line = profanity_file.read() | |
bad_words = set(json.loads(profanity_line).keys()) | |
used_bad_words = set(all_words.keys()) & bad_words | |
print '\nWe\'ve used these instances of profanity in our emails' | |
for word in used_bad_words: | |
print '\t',word, 'used',all_words[word],'times' | |
profane_people = defaultdict(list) | |
for email, threads in emails.iteritems(): | |
for thread in threads: | |
for message in thread: | |
body_words = message['body'].split() | |
for word in body_words: | |
fixed_word = fix_word(word) | |
if fixed_word in bad_words: | |
profane_people[find_sender_by_sendfield(message['sender'])].append(fixed_word) | |
profanity_count = {} | |
for person, profanity_list in profane_people.iteritems(): | |
profanity_count[person] = len(profanity_list) | |
sorted_profanity_count = sort_dict(profanity_count) | |
sorted_profanity_people_words = [] | |
print '\nThe profanity used distributed across each person:' | |
for person, count in sorted_profanity_count: | |
sorted_profanity_people_words.append((person, profane_people[person])) | |
print '\t',person, 'used', profane_people[person] | |
analytics_dump['sorted_profanity_people_words'] = sorted_profanity_people_words | |
# Persist every computed statistic for later visualization.
analytics_file_path = 'results.json'
# Context manager guarantees the file is flushed and closed even if
# serialization raises (the original left the handle open on error).
with open(analytics_file_path, 'w') as analytics_file:
    analytics_file.write(json.dumps(analytics_dump))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Uses the Gmail API (https://developers.google.com/gmail/api/) | |
# to collect emails from specified email addresses, parse, and save | |
# them as a series of files, one per email, as a list of | |
# JSON-formatted message threads | |
import httplib2 | |
import time | |
import os | |
import base64 | |
import json | |
import datetime | |
from apiclient import errors | |
from apiclient import discovery | |
import oauth2client | |
from oauth2client import client | |
from oauth2client import tools | |
try: | |
import argparse | |
flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args() | |
except ImportError: | |
flags = None | |
print 'FLAGS', flags | |
SCOPES = 'https://www.googleapis.com/auth/gmail.readonly' | |
CLIENT_SECRET_FILE = 'client_secret.json' | |
APPLICATION_NAME = 'Skanda\'s Data Playground' | |
def get_credentials(): | |
"""Gets valid user credentials from storage. | |
If nothing has been stored, or if the stored credentials are invalid, | |
the OAuth2 flow is completed to obtain the new credentials. | |
Returns: | |
Credentials, the obtained credential. | |
""" | |
home_dir = os.path.expanduser('~') | |
credential_dir = os.path.join(home_dir, '.credentials') | |
if not os.path.exists(credential_dir): | |
os.makedirs(credential_dir) | |
credential_path = os.path.join(credential_dir, 'gmail-quickstart.json') | |
store = oauth2client.file.Storage(credential_path) | |
credentials = store.get() | |
if not credentials or credentials.invalid: | |
flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES) | |
flow.user_agent = APPLICATION_NAME | |
if flags: | |
credentials = tools.run_flow(flow, store, flags) | |
else: # Needed only for compatability with Python 2.6 | |
credentials = tools.run(flow, store) | |
print 'Storing credentials to ' + credential_path | |
return credentials | |
def analyze_part(part, headers, unix_date, emails): | |
if 'data' in part['body']: | |
b64_body = part['body']['data'].encode('ascii','ignore') | |
else: | |
print '\t\tDiscarding because no data', part | |
return | |
body = base64.b64decode(b64_body, '-_') | |
valid_receiver = None | |
for header in headers: | |
if header['name'] == 'From': | |
sender = header['value'].encode('ascii','ignore') | |
elif header['name'] == 'To': | |
receiver = header['value'].encode('ascii','ignore') | |
for email in emails: | |
if email in receiver.lower(): | |
valid_receiver = email | |
elif header['name'].lower() == 'cc': | |
receiver = header['value'].encode('ascii','ignore') | |
for email in emails: | |
if email in receiver.lower(): | |
valid_receiver = email | |
elif header['name'] == 'Subject': | |
subject = header['value'].encode('ascii','ignore') | |
if valid_receiver: | |
seconds = unix_date/1000.0 | |
real_date = datetime.datetime.fromtimestamp(seconds).strftime('%Y-%m-%d %H:%M:%S.%f') | |
print '\t\t', 'Saving message from ', real_date, 'with subject ', '\"' + subject + '\"' | |
message = {'date':unix_date, 'sender':sender.strip(), 'receiver':valid_receiver.strip(), | |
'subject':subject.strip(), 'body':body.strip().replace('\n',' ').replace('\r',' ')} | |
return message | |
else: | |
print '\t\tDiscarding because no target emails match', receiver.lower() | |
def get_thread(service, thread_id, emails): | |
raw_thread = service.users().threads().get(userId='me', id=thread_id).execute() | |
raw_messages = raw_thread['messages'] | |
time.sleep(1) | |
print '\tGetting thread with ID', thread_id, 'and', len(raw_messages), 'messages' | |
messages = [] | |
for raw_message in raw_messages: | |
print '\t\tAnalyzing a message...' | |
unix_date = int(raw_message['internalDate']) | |
payload = raw_message['payload'] | |
parts = [] | |
parts.append(payload) | |
for part in parts: | |
if 'parts' in part.keys(): | |
parts.extend(part['parts']) | |
if 'body' in part.keys() and part['mimeType'] == 'text/plain': | |
message = analyze_part(part, payload['headers'], unix_date, emails) | |
if message: | |
messages.append(message) | |
break | |
return messages | |
def write_thread_to_file(my_thread, file):
    """Append one thread to an open file as a single JSON-encoded line."""
    file.write(json.dumps(my_thread) + '\n')
def recordEmailThreadBunch(service, email, emails): | |
print 'Getting email threads from', email | |
filename = email.split('@')[0] + '.json' | |
file = open(filename, 'w') | |
query = 'to:' + email | |
threads_on_page = service.users().threads().list(userId='me', q=query).execute() | |
try: | |
if 'threads' in threads_on_page: | |
for thread_snippet in threads_on_page['threads']: | |
thread = get_thread(service, thread_snippet['id'], emails) | |
write_thread_to_file(thread, file) | |
while 'nextPageToken' in threads_on_page: | |
print '\tTurning page for email',email | |
page_token = threads_on_page['nextPageToken'] | |
threads_on_page = service.users().threads().list(userId='me', pageToken=page_token, q=query).execute() | |
for thread_snippet in threads_on_page['threads']: | |
thread = get_thread(service, thread_snippet['id'], emails) | |
write_thread_to_file(thread, file) | |
except errors.HttpError, error: | |
print 'An error occurred: %s' % error | |
print 'Closing write file for', email | |
file.close() | |
def set_up_credentials(): | |
print 'Setting up credentials...' | |
credentials = get_credentials() | |
http = credentials.authorize(httplib2.Http()) | |
service = discovery.build('gmail', 'v1', http=http) | |
print 'Finished setting up credentials...' | |
return service | |
def main():
    """Collect threads for every target mailing list into per-list files."""
    service = set_up_credentials()
    target_lists = ['therearenosecrets@mit.edu', 'letsshitoncourtney@mit.edu', 'b-entry@mit.edu', 'bentry@mit.edu']
    for address in target_lists:
        recordEmailThreadBunch(service, address, target_lists)

if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I lived in a group that has roughly four mailing lists that we use to communicate. I was interested to see a few statistics on the emails: most frequent senders, most frequented mailing lists, most profane senders, and more. So I spent a weekend collecting all the emails sent to these mailing lists over two years, and crunching a few numbers over them.
What was collected?
I took a look at email activity on four different mailing lists over the last two years.
What was analyzed? What are some results?
For privacy reasons, I can’t disclose all the email results of our living group, but here are some interesting tidbits:
Feel free to browse through the scripts used to collect and analyze the data. I was planning on adding some visualizations to the statistics, but I was wrapping up my internship around the same time, and it quickly became crunch time at my work before I got around to topping the analysis off with graphs. Please enjoy.